Fetching the training sample data from the website; the code:
import urllib2
from BeautifulSoup import BeautifulSoup
import sys
import re
import time

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: allow implicit utf-8 when writing Chinese text

# Baidu News category pages: military, finance, internet, housing, auto,
# sports, entertainment, games, education, health, tech, society
url = ['http://news.baidu.com/n?cmd=4&class=mil&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=finannews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=internet&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=housenews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=autonews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=enternews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=gamenews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=edunews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=healthnews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=technnews&pn=1&from=tab',
       'http://news.baidu.com/n?cmd=4&class=socianews&pn=1&from=tab']
# One output file per category (the list is truncated in the original)
ff = ['E:/baidu/军事.txt', 'E:/baidu/财经.txt', 'E:/baidu/互联网.txt', 'E:/baidu/
for j in range(7, 8):  # scrape one category at a time
    soup = BeautifulSoup(urllib2.urlopen(url[j]).read())
    main = soup.find('div', {'class': 'p2'})  # the block holding the article links
    index = main.findAll('a')
    len_0 = len(index)
    a = []
    for i in range(len_0):
        a.append(index[i]['href'])  # collect every article URL on the page
    for i in range(len_0):
        try:
            soup = BeautifulSoup(urllib2.urlopen(a[i]).read())
            # keep only the text nodes that contain Chinese characters
            txt = soup.findAll(text=re.compile(ur'[\u4e00-\u9fa5]'))
            txt_ = ''.join(txt)
            f = open(ff[j], 'a')
            print >> f, txt_  # append the article text to the category file
            f.close()
        except:
            continue  # skip pages that fail to download or parse
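The script above targets Python 2 with BeautifulSoup 3. As a rough sketch only, the same scrape under Python 3 with the bs4 package might look as follows; fetch_category is a hypothetical helper name, the 'p2' div and URL layout are carried over from the code above, and the live page structure may have changed since:

import re
import urllib.request
from bs4 import BeautifulSoup  # pip install beautifulsoup4

def fetch_category(list_url, out_path):
    # download the category page and collect the article links
    page = urllib.request.urlopen(list_url).read()
    soup = BeautifulSoup(page, 'html.parser')
    main = soup.find('div', {'class': 'p2'})  # same container as in the code above
    links = [a['href'] for a in main.find_all('a') if a.has_attr('href')]
    han = re.compile(u'[\u4e00-\u9fa5]')
    with open(out_path, 'a', encoding='utf-8') as f:
        for link in links:
            try:
                article = BeautifulSoup(urllib.request.urlopen(link).read(),
                                        'html.parser')
                # keep only the text nodes that contain Chinese characters
                txt = ''.join(article.find_all(text=han))
            except Exception:
                continue  # skip unreachable or unparsable pages
            f.write(txt + '\n')

# e.g. fetch_category(url[7], ff[7]) for the games category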
The supervised-learning text classification code:
import jieba
import os
import sys
import codecs
from sklearn import feature_extraction
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
#--------------#
def load_data():
    corpus_train = []
    target_train = []
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/train'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + '/' + filelist[num]
        filename = os.path.basename(filetext)  # the file name doubles as the class label
        myfile = codecs.open(filetext, 'r', 'utf-8')
        temp = myfile.readlines()
        myfile.close()
        # split each category file into 100 equal chunks; each chunk
        # becomes one training document labelled with the file name
        for i in range(0, 100):
            len_0 = len(temp)
            seg_list = jieba.cut(','.join(temp[int(i*len_0/100):int((i+1)*len_0/100)]), cut_all=False)
            words = ' '.join(seg_list)  # space-separated tokens, ready for CountVectorizer
            target_train.append(filename)
            corpus_train.append(words)
    #--------------#
    corpus_test = []
    target_test = []
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/test'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + '/' + filelist[num]
        myfile = open(filetext, 'r')
        temp = myfile.readlines()
        myfile.close()
        # the first line of each test file holds the true label; the rest is the article
        seg_list = jieba.cut(','.join(temp[1:]), cut_all=False)
        words = ' '.join(seg_list)
        target_test.append(temp[0])
        corpus_test.append(words)
    return [[corpus_train, target_train], [corpus_test, target_test]]
#--------------#
def data_pro():
    [[corpus_train, target_train], [corpus_test, target_test]] = load_data()
    count_v1 = CountVectorizer()
    # CountVectorizer turns the corpus into a term-frequency matrix:
    # element [i][j] is the frequency of word j in document i
    counts_train = count_v1.fit_transform(corpus_train)
    transformer = TfidfTransformer()
    # TfidfTransformer converts the raw counts into tf-idf weights
    tfidf_train = transformer.fit(counts_train).transform(counts_train)
    weight_train = tfidf_train.toarray()
    # weight_train[i][j] is the tf-idf weight of word j in training document i
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    # the two CountVectorizers share one vocabulary, so the test matrix
    # gets the same columns as the training matrix
    counts_test = count_v2.fit_transform(corpus_test)
    transformer = TfidfTransformer()
    # the idf statistics are fitted on the training counts, then applied to the test counts
    tfidf_test = transformer.fit(counts_train).transform(counts_test)
    weight_test = tfidf_test.toarray()
    # weight_test[i][j] is the tf-idf weight of word j in test document i
    return [[weight_train, target_train], [weight_test, target_test]]
#--------------#
[[weight_train, target_train], [weight_test, target_test]] = data_pro()
#---------------------------------------------#
# KNN model
knnclf = KNeighborsClassifier()
knnclf.fit(weight_train, target_train)
knn_pred = knnclf.predict(weight_test)
#---------------------------------------------#
# SVM model
svc = svm.SVC(kernel='linear')
svc.fit(weight_train, target_train)
svc_pred = svc.predict(weight_test)
#---------------------------------------------#
# Decision tree (CART) model
tre = tree.DecisionTreeClassifier()
tre.fit(weight_train, target_train)
tre_pred = tre.predict(weight_test)
#---------------------------------------------#
# Naive Bayes model
bayes = MultinomialNB(alpha=0.01)
bayes.fit(weight_train, target_train)
bayes_pred = bayes.predict(weight_test)
#---------------------------------------------#
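As an aside, scikit-learn's TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single step, and all four classifiers above accept sparse input, so the dense toarray() conversion in data_pro() can be skipped. A minimal sketch of that variant, reusing load_data() from above (data_pro_sparse is a hypothetical name):

from sklearn.feature_extraction.text import TfidfVectorizer

def data_pro_sparse():
    [[corpus_train, target_train], [corpus_test, target_test]] = load_data()
    vectorizer = TfidfVectorizer()
    # fit the vocabulary and idf statistics on the training corpus only,
    # then reuse them for the test corpus
    tfidf_train = vectorizer.fit_transform(corpus_train)
    tfidf_test = vectorizer.transform(corpus_test)
    # these sparse matrices can be passed to the classifiers directly,
    # e.g. MultinomialNB(alpha=0.01).fit(tfidf_train, target_train)
    return [[tfidf_train, target_train], [tfidf_test, target_test]]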
Two open-source libraries are called:
1. The jieba Chinese word-segmentation library, which segments the Chinese articles scraped from the web (a short demo follows this list).
2. The scikit-learn machine-learning library. From it we use the tf-idf algorithm, which converts the text into a numeric feature matrix, plus four classification algorithms: KNN, SVM, Naive Bayes and CART. These perform the supervised classification learning on the scraped articles.
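To illustrate point 1, here is a tiny jieba example; the sample sentence is made up for illustration, and the exact tokens depend on jieba's dictionary version:

import jieba

# cut_all=False selects precise mode, the same setting used in load_data() above
seg_list = jieba.cut('百度新闻是面向用户的新闻服务平台', cut_all=False)
print ' '.join(seg_list)
# expected output, roughly: 百度 新闻 是 面向 用户 的 新闻 服务 平台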
Results:
Training: 1001 samples across 3 article classes:
Class 1, internet, 300 samples
Class 2, military, 309 samples
Class 3, finance, 302 samples
Testing: 150 test samples
KNN: 118 correct, 32 wrong
SVM: 125 correct, 25 wrong
CART: 122 correct, 28 wrong
Bayes: 130 correct, 20 wrong
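The hit counts above are plain accuracy over the 150 test samples. Assuming the first-line labels in the test files match the training file names, they could be reproduced along these lines:

from sklearn import metrics

target_test = [t.strip() for t in target_test]  # drop trailing newlines from the labels
for name, pred in [('KNN', knn_pred), ('SVM', svc_pred),
                   ('CART', tre_pred), ('Bayes', bayes_pred)]:
    hits = sum(p == t for p, t in zip(pred, target_test))
    print '%s: %d correct, %d wrong (accuracy %.3f)' % (
        name, hits, len(target_test) - hits,
        metrics.accuracy_score(target_test, pred))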