from random import sample, random, choice
from math import log
from utils import run_time
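# `run_time` is a timing decorator from this repo's utils module. If utils is
# not on the path, a minimal stand-in could look like the sketch below (an
# assumption, not the repo's actual implementation):
#
#     from functools import wraps
#     from time import time
#
#     def run_time(fn):
#         @wraps(fn)
#         def wrapper(*args, **kwargs):
#             start = time()
#             ret = fn(*args, **kwargs)
#             print("Total run time is %.2f s" % (time() - start))
#             return ret
#         return wrapper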
class Node(object):
    def __init__(self, size):
        """Node class to build tree leaves.

        Keyword Arguments:
            size {int} -- Node size (default: {None})
        """
        # Node size
        self.size = size
        # Feature to split
        self.split_feature = None
        # Split point
        self.split_point = None
        # Left child node
        self.left = None
        # Right child node
        self.right = None

class IsolationTree(object):
    def __init__(self, X, n_samples, max_depth):
        """Isolation Tree class.

        Arguments:
            X {list} -- 2d list with int or float
            n_samples {int} -- Subsample size
            max_depth {int} -- Maximum depth of the tree
        """
        self.height = 0
        # In case n_samples is greater than n
        n = len(X)
        if n_samples > n:
            n_samples = n
        # Root node
        self.root = Node(n_samples)
        # Build isolation tree
        self._build_tree(X, n_samples, max_depth)
    def _get_split(self, X, idx, split_feature):
        """Randomly choose a split point.

        Arguments:
            X {list} -- 2d list object with int or float
            idx {list} -- 1d list object with int
            split_feature {int} -- Column index of X

        Returns:
            float -- Split point, or None if the column cannot be split
        """
        # The split point should be greater than min(X[feature])
        unique = set(map(lambda i: X[i][split_feature], idx))
        # Cannot split
        if len(unique) == 1:
            return None
        unique.remove(min(unique))
        x_min, x_max = min(unique), max(unique)
        # Caution: random() -> x in the interval [0, 1)
        return random() * (x_max - x_min) + x_min
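    # Illustrative note (added, not in the original source): if the chosen
    # column's values are {1, 3, 7}, the minimum 1 is removed first, so the
    # split point is drawn uniformly from [3, 7). The value 1 always falls
    # left and 7 always falls right, so both children are non-empty.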
    def _build_tree(self, X, n_samples, max_depth):
        """The current node's data space is divided into 2 sub spaces: data
        less than the split point in the specified dimension goes to the
        current node's left child, data greater than or equal to the split
        point goes to the current node's right child. Recursively construct
        new child nodes until the data cannot be split in the child nodes or
        the child nodes have reached max_depth.

        Arguments:
            X {list} -- 2d list object with int or float
            n_samples {int} -- Subsample size
            max_depth {int} -- Maximum depth of the tree
        """
        # Dataset shape
        m = len(X[0])
        n = len(X)
        # Randomly select sample points into the root node of the tree
        idx = sample(range(n), n_samples)
        # Queue of [depth, node, idx]
        que = [[0, self.root, idx]]
        # BFS
        while que and que[0][0] <= max_depth:
            depth, nd, idx = que.pop(0)
            # Stop splitting if X cannot be split
            nd.split_feature = choice(range(m))
            nd.split_point = self._get_split(X, idx, nd.split_feature)
            if nd.split_point is None:
                continue
            # Split
            idx_left = []
            idx_right = []
            while idx:
                i = idx.pop()
                xi = X[i][nd.split_feature]
                if xi < nd.split_point:
                    idx_left.append(i)
                else:
                    idx_right.append(i)
            # Generate left and right children
            nd.left = Node(len(idx_left))
            nd.right = Node(len(idx_right))
            # Put the left and right children into the que with depth plus one
            que.append([depth + 1, nd.left, idx_left])
            que.append([depth + 1, nd.right, idx_right])
        # Update the height of the IsolationTree
        self.height = depth
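    # Note (added, from the Isolation Forest paper): anomalies are isolated in
    # few splits, so trees only need to grow to about ceil(log2(n_samples));
    # points that land in a leaf with more than one sample are "normal enough",
    # and their remaining path length is estimated by _get_adjustment below.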
    def _predict(self, xi):
        """Auxiliary function of predict.

        Arguments:
            xi {list} -- 1d list with int or float

        Returns:
            tuple -- The depth of the node which xi belongs to, and the size
            of that node
        """
        # Search xi from the IsolationTree until xi is at a leaf node
        nd = self.root
        depth = 0
        while nd.left and nd.right:
            if xi[nd.split_feature] < nd.split_point:
                nd = nd.left
            else:
                nd = nd.right
            depth += 1
        return depth, nd.size

class IsolationForest(object):
    def __init__(self):
        """IsolationForest, randomly build some IsolationTree instances, and
        average the scores of each IsolationTree.

        Attributes:
            trees {list} -- 1d list with IsolationTree objects
            adjustment {float}
        """
        self.trees = None
        self.adjustment = None  # TBC
    def fit(self, X, n_samples=256, max_depth=10, n_trees=100):
        """Build IsolationForest with dataset X.

        Arguments:
            X {list} -- 2d list with int or float

        Keyword Arguments:
            n_samples {int} -- According to the paper, set number of samples
            to 256 (default: {256})
            max_depth {int} -- Tree height limit (default: {10})
            n_trees {int} -- According to the paper, set number of trees to
            100 (default: {100})
        """
        self.adjustment = self._get_adjustment(n_samples)
        self.trees = [IsolationTree(X, n_samples, max_depth)
                      for _ in range(n_trees)]
    def _get_adjustment(self, node_size):
        """Calculate adjustment according to the formula in the paper:
        c(n) = 2 * H(n - 1) - 2 * (n - 1) / n, where the harmonic number H(i)
        is approximated by ln(i) + 0.5772156649 (Euler's constant).

        Arguments:
            node_size {int} -- Number of samples in the leaf node

        Returns:
            float -- Adjustment
        """
        if node_size > 2:
            i = node_size - 1
            ret = 2 * (log(i) + 0.5772156649) - 2 * i / node_size
        elif node_size == 2:
            ret = 1
        else:
            ret = 0
        return ret
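    # Worked example (added note): for node_size = 256,
    # c(256) = 2 * (ln(255) + 0.5772156649) - 2 * 255 / 256 ≈ 10.24,
    # i.e. the average search path length of a BST built on 256 points.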
    def _predict(self, xi):
        """Auxiliary function of predict.

        Arguments:
            xi {list} -- 1d list object with int or float

        Returns:
            float -- Anomaly score of xi
        """
        # Calculate the average score of xi at each tree
        score = 0
        n_trees = len(self.trees)
        for tree in self.trees:
            depth, node_size = tree._predict(xi)
            score += (depth + self._get_adjustment(node_size))
        score = score / n_trees
        # Scale
        return 2 ** -(score / self.adjustment)
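    # Note (added, from the Isolation Forest paper): this is the anomaly score
    # s(x, n) = 2 ** (-E(h(x)) / c(n)). Scores close to 1 indicate anomalies,
    # scores well below 0.5 indicate normal points, and scores around 0.5 for
    # all samples mean the data has no distinct anomaly.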
    def predict(self, X):
        """Get the prediction of y.

        Arguments:
            X {list} -- 2d list object with int or float

        Returns:
            list -- 1d list object with float
        """
        return [self._predict(xi) for xi in X]

@run_time
def main():
    print("Comparing average score of X and the outlier's score...")
    # Generate a dataset randomly
    n = 100
    X = [[random() for _ in range(5)] for _ in range(n)]
    # Add an outlier
    X.append([10] * 5)
    # Train model
    clf = IsolationForest()
    clf.fit(X, n_samples=500)
    # Show result
    print("Average score is %.2f" % (sum(clf.predict(X)) / len(X)))
    print("Outlier's score is %.2f" % clf._predict(X[-1]))
if __name__ == "__main__":
main()