KS 分箱（Kolmogorov-Smirnov，连续型特征分箱）

KS 即 Kolmogorov-Smirnov，其优势与理论部分可参考： https://blog.titanwolf.in/a?id=00950-78d07928-af47-4f50-a4a0-dc975d0d74a2

注意： y只能取0和1。其中1是坏样本，0是好样本。

代码示例来源： https://blog.csdn.net/yeshang_lady/article/details/112623604

importpandasaspdimportnumpyasnpfromsklearn.datasetsimportmake _ classificationdefuniveral _ df (数据、功能、目标， bad_name ) : " "与统计feature不同值对应的target数feature:分割后的变量target:目标变量的值只能取0和1，0为好的样本" " data=result=data.group by [ feature ] [ target ].agg [ ' count '， ' sum ' ] result=result.sort _ index (bad _ name (result=result.fill na (0) result [ good _ name ]=result [ tood ] resull returnresultdefget _ max _ ks (data _ df，start，end，rate，total_name，good_name，bad _ name (333365365306; 3360 ) #data_cut统计的范围包括end data _ cut [ ' bad _ rate _ cum ' ]=(data _ cut [ bad _ name ]/data _ cut [ bad _ cum ] . sum ().cumsum ) ) data _ cut [ ' total _ cum ' ]=data=total _ all-data _ cut [ total _ name ] data _ Cu t [ ] - data try 3360 cut=data _ cut [ (data _ cut (' total _ cum ' )=limit ) data_cut('total_other_cum ' )=limimii cut_list、total_name、good_name、bad_name ) 336具体判断条件是：是否存在与1个框对应的类别全部为0或1~2的现有框，bad rate为3 woe ' ' ' bad_all=data_df[bad_name].sum () ) ) 652 good _ all=data _ df [ good _ name ].sum (cood ) bad_name].sum(forxincut_list ) ) cut _ good=NP.array ([ data _ df.loc [ x [0] : x [1] )， good_name].sum(forxincut_list ) ) conD1=(0notincut_bad ) and )0notincut_good ) cut _ bad _ rate=cut good

t_woe) and sorted(cut_bad_rate, reverse=True) == list(cut_bad_rate) cond3 = sorted(cut_woe, reverse=True) == list(cut_woe) and sorted(cut_bad_rate, reverse=False) == list(cut_bad_rate) cond4 = sorted(cut_bad_rate, reverse=False) == list(cut_bad_rate) or sorted(cut_bad_rate, reverse=True) == list( cut_bad_rate) return cond1 and cond4def cut_fun(data_df, start, end, rate, total_name, good_name, bad_name): """ 对从start到end这一段数据进行下一步切分，并返回新的割点对 """ cut = get_max_ks(data_df, start, end, rate, total_name, good_name, bad_name) if cut: return [(start, cut), (cut + 1, end)] else: return [(start, end)]def ks_cut_main(data_df, feature, rate, total_name, good_name, bad_name, 冷酷的大白s=None, null_value=False, missing_value=[]): """ 冷酷的大白s: 分箱数。默认为None,即不限定分箱数目。若为int,则为指定的分箱数 null_value: 布尔型。字段中填充的缺失值是否需要单独划分为一箱。若为True,则单独划分为一箱。 missing_value:若null_value为True,则missing_value中的数据即为缺失值,每个缺失值会单独成一箱 """ if null_value and missing_value: data_df = data_df[~data_df[feature].isin(missing_value)] start = data_df.index.min() end = data_df.index.max() cut_list = [(start, end)] # 真正有效的割点集合 tt = cut_list.copy() for cut_seg in tt: cut_冷酷的大白 = cut_fun(data_df, cut_seg[0], cut_seg[1], rate, total_name, good_name, bad_name) if len(cut_冷酷的大白) > 1: temp = cut_list.copy() index_seg = temp.index(cut_seg) temp[index_seg] = cut_冷酷的大白[0] temp.insert(index_seg + 1, cut_冷酷的大白[1]) if verify_cut(data_df, temp, total_name, good_name, bad_name): cut_list = temp tt.extend(cut_冷酷的大白) if 冷酷的大白s and len(cut_list) > 冷酷的大白s: # 判断是否达到限定的分箱数 break # 将割点对转化为对应的数值 # cut_list中保留的割点对中的数据为data_df中的inde，这里想要获得真正的feature的割点数据则需要依据data_df的index找到对应的feature字段的值 cut_list = sorted([-np.inf] + [data_df.loc[item[0], feature] for item in cut_list] + [np.inf] + missing_value) return cut_listdef ks_best_cut(x_value: np.ndarray, y_value: np.ndarray, 冷酷的大白s=None): data = pd.DataFrame([x_value, y_value]).T data.columns = ['x', 'target'] uni_result = univeral_df(data, 'x', 'target', 'total', 'good', 'bad') cut_冷酷的大白_result = 
ks_cut_main(data_df=uni_result, feature='x', rate=0.05, total_name='total', good_name='good', bad_name='bad', 冷酷的大白s=冷酷的大白s) return cut_冷酷的大白_resultif __name__ == '__main__': data_x, data_y = make_classification(n_samples=1000, n_classes=2, n_features=4, n_informative=2, random_state=0) # data_y中：1为坏样本，0为好样本 ks_cut = ks_best_cut(data_x[:, 0], data_y, 冷酷的大白s=10) # 对A进行分箱 x_冷酷的大白_value = pd.cut(data_x[:, 0], ks_cut, right=False) print(x_冷酷的大白_value.value_counts()) print("类别结果：", x_冷酷的大白_value.codes)

得到结果：

[-inf, -3.107)        0
[-3.107, -0.934)    296
[-0.934, -0.6)      144
[-0.6, -0.177)       53
[-0.177, 0.81)      195
[0.81, inf)         312
dtype: int64
类别结果： [5 5 5 4 2 4 1 5 4 1 1 2 5 1 3 2 1 3 1 1 4 5 3 1 4 1 4 5 5 .....

参考文章

https://blog.csdn.net/yeshang_lady/article/details/112623604
https://www.cnblogs.com/wqbin/p/10549683.html