
A Classic Decision Tree Worked Example in Data Mining: Implementing ID3 in Python


Data Mining Algorithms, Principles and Practice: Decision Trees
With thanks to Dr. Yang for his remote guidance.
Level 2: Decision Tree Algorithm Principles
Level 3: Implementing an ID3 Decision Tree

Thanks again to Dr. Yang for his remote guidance on Level 2.

Level 2: Decision Tree Algorithm Principles

This level implements the three quantities ID3 is built on. For a label set D, the entropy is H(D) = -sum_k p_k * log2(p_k), where p_k is the proportion of class k. Conditioning on a feature A gives H(D|A) = sum_v (|D_v| / |D|) * H(D_v), where D_v is the subset of samples whose value of A is v, and the information gain is Gain(D, A) = H(D) - H(D|A).

#encoding=utf8
import numpy as np

# Compute the entropy of an array of sample labels
def calcInfoEntropy(label):
    '''
    input:  label(ndarray): sample labels
    output: InfoEntropy(float): entropy
    '''
    #*********Begin*********#
    label_set = set(label)
    InfoEntropy = 0
    for temp_label in label_set:
        count = 0
        for j in range(len(label)):
            if label[j] == temp_label:
                count += 1
        # proportion of this label in the dataset
        p = count / len(label)
        InfoEntropy -= p * np.log2(p)
    #*********End*********#
    return InfoEntropy

# Compute the conditional entropy for one feature value
def calcHDA(feature, label, index, value):
    '''
    input:  feature(ndarray): sample features
            label(ndarray): sample labels
            index(int): index of the feature column to use
            value(int): the value in that column to condition on
    output: HDA(float): conditional entropy weighted by P(A = value)
    '''
    #*********Begin*********#
    count = 0
    # sub_feature and sub_label hold the sub-dataset obtained by
    # filtering on the given feature column and value
    sub_feature = []
    sub_label = []
    for i in range(len(feature)):
        if feature[i][index] == value:
            count += 1
            sub_feature.append(feature[i])
            sub_label.append(label[i])
    pHA = count / len(feature)
    e = calcInfoEntropy(sub_label)
    HDA = pHA * e
    #*********End*********#
    return HDA

# Compute the information gain of one feature column
def calcInfoGain(feature, label, index):
    '''
    input:  feature(ndarray): features from the test-case dict
            label(ndarray): labels from the test-case dict
            index(int): index of the feature column; e.g. index=0 means
                        computing the information gain of the first feature
    output: InfoGain(float): information gain
    '''
    #*********Begin*********#
    base_e = calcInfoEntropy(label)
    f = np.array(feature)
    # the set of values taken by the chosen feature column
    f_set = set(f[:, index])
    sum_HDA = 0
    # conditional entropy H(D|A)
    for value in f_set:
        sum_HDA += calcHDA(feature, label, index, value)
    # information gain = H(D) - H(D|A)
    InfoGain = base_e - sum_HDA
    #*********End*********#
    return InfoGain
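To sanity-check these functions, here is a minimal example with hypothetical values (the platform's actual test cases are not shown in the post). Column 0 perfectly determines the label, so its information gain equals the full entropy, while column 1 carries no information:

# Hypothetical smoke test, not part of the original exercise
feature = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
label = np.array([0, 1, 1, 0])
print(calcInfoEntropy(label))           # 1.0: two classes, evenly split
print(calcInfoGain(feature, label, 0))  # 1.0: column 0 determines the label
print(calcInfoGain(feature, label, 1))  # 0.0: column 1 is uninformative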

Level 3: Implementing an ID3 Decision Tree

This level reuses the three functions from Level 2 and adds feature selection, recursive tree construction, and classification.

import numpy as np

# Compute entropy (implemented by hand, as in Level 2)
def calcInfoEntropy(label):
    '''
    input:  label(ndarray): sample labels
    output: InfoEntropy(float): entropy
    '''
    label_set = set(label)
    InfoEntropy = 0
    for l in label_set:
        count = 0
        for j in range(len(label)):
            if label[j] == l:
                count += 1
        # proportion of this label in the dataset
        p = count / len(label)
        InfoEntropy -= p * np.log2(p)
    return InfoEntropy

# Compute the conditional entropy for one feature value
def calcHDA(feature, label, index, value):
    '''
    input:  feature(ndarray): sample features
            label(ndarray): sample labels
            index(int): index of the feature column to use
            value(int): the value in that column to condition on
    output: HDA(float): conditional entropy weighted by P(A = value)
    '''
    count = 0
    # sub_feature and sub_label hold the sub-dataset obtained by
    # filtering on the given feature column and value
    sub_feature = []
    sub_label = []
    for i in range(len(feature)):
        if feature[i][index] == value:
            count += 1
            sub_feature.append(feature[i])
            sub_label.append(label[i])
    pHA = count / len(feature)
    e = calcInfoEntropy(sub_label)
    HDA = pHA * e
    return HDA

# Compute the information gain of one feature column
def calcInfoGain(feature, label, index):
    '''
    input:  feature(ndarray): features from the test-case dict
            label(ndarray): labels from the test-case dict
            index(int): index of the feature column; e.g. index=0 means
                        computing the information gain of the first feature
    output: InfoGain(float): information gain
    '''
    base_e = calcInfoEntropy(label)
    f = np.array(feature)
    # the set of values taken by the chosen feature column
    f_set = set(f[:, index])
    sum_HDA = 0
    # conditional entropy H(D|A)
    for value in f_set:
        sum_HDA += calcHDA(feature, label, index, value)
    # information gain = H(D) - H(D|A)
    InfoGain = base_e - sum_HDA
    return InfoGain

# Find the feature with the highest information gain
def getBestFeature(feature, label):
    '''
    input:  feature(ndarray): sample features
            label(ndarray): sample labels
    output: best_feature(int): index of the feature with the highest gain
    '''
    #*********Begin*********#
    max_infogain = 0
    best_feature = 0
    for i in range(len(feature[0])):
        infogain = calcInfoGain(feature, label, i)
        if infogain > max_infogain:
            max_infogain = infogain
            best_feature = i
    #*********End*********#
    return best_feature

# Build the decision tree
def createTree(feature, label):
    '''
    input:  feature(ndarray): training sample features
            label(ndarray): training sample labels
    output: tree(dict): decision tree model
    '''
    #*********Begin*********#
    # all samples share one label: no need to split further
    if len(set(label)) == 1:
        return label[0]
    # only one feature left, or all samples have identical features:
    # return the majority label
    if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
        vote = {}
        for l in label:
            if l in vote.keys():
                vote[l] += 1
            else:
                vote[l] = 1
        max_count = 0
        vote_label = None
        for k, v in vote.items():
            if v > max_count:
                max_count = v
                vote_label = k
        return vote_label
    # index of the feature with the highest information gain
    best_feature = getBestFeature(feature, label)
    tree = {best_feature: {}}
    f = np.array(feature)
    # all values taken by best_feature
    f_set = set(f[:, best_feature])
    # build the sub-dataset sub_feature, sub_label for each value
    for v in f_set:
        sub_feature = []
        sub_label = []
        for i in range(len(feature)):
            if feature[i][best_feature] == v:
                sub_feature.append(feature[i])
                sub_label.append(label[i])
        # recursively build the subtree for this branch
        tree[best_feature][v] = createTree(sub_feature, sub_label)
    #*********End*********#
    return tree

# Classify with the decision tree
def dt_clf(train_feature, train_label, test_feature):
    '''
    input:  train_feature(ndarray): training sample features
            train_label(ndarray): training sample labels
            test_feature(ndarray): test sample features
    output: predict(ndarray): predicted labels of the test samples
    '''
    #*********Begin*********#
    result = []
    tree = createTree(train_feature, train_label)

    def classify(tree, feature):
        # a leaf node stores the label directly
        if not isinstance(tree, dict):
            return tree
        t_index, t_value = list(tree.items())[0]
        f_value = feature[t_index]
        if isinstance(t_value, dict):
            # descend into the branch matching this sample's value
            classLabel = classify(tree[t_index][f_value], feature)
            return classLabel
        else:
            return t_value

    for feature in test_feature:
        result.append(classify(tree, feature))
    predict = np.array(result)
    #*********End*********#
    return predict
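As a quick smoke test (a hypothetical example, not from the original exercise), train on a small integer-encoded dataset where the label is 1 only when both feature columns are 1:

# Hypothetical toy dataset to exercise the full pipeline
train_feature = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
train_label = np.array([0, 0, 0, 1])
test_feature = np.array([[1, 1], [0, 0]])
print(dt_clf(train_feature, train_label, test_feature))  # expected: [1 0]

Note that createTree keeps every feature column in the sub-datasets and keys each tree node on the absolute column index, so classify can index test samples directly. The trade-off is that classification raises a KeyError if a test sample carries a feature value never seen on that branch during training.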
