
Scraping University Data with Python: Fetching Grades from the URP Academic Affairs System


Using Python 3 to scrape Hohai University's URP academic affairs system: fetch your personal information and this semester's grades, compute the semester GPA, and save everything to an Excel file.

I don't feel like writing up all the details right now; there were a few pitfalls along the way, which I'll come back to in a few days.

1. Use Baidu's OCR to recognize the captcha of the academic affairs system;
2. use the urllib library to simulate the login and fetch pages;
3. use BeautifulSoup to parse the pages and extract the required data;
4. use xlwt, xlrd and xlutils to create, read and modify the Excel file (a minimal sketch of how the first three pieces fit together follows this list).
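The sketch below is illustrative only: it reuses the URLs and form field names (zjh, mm, v_yzm) from the full script in Part II, but has you type the captcha by hand instead of calling Baidu OCR, so you can see the cookie-preserving login flow in isolation.

# Minimal sketch of steps 1-3. The captcha is entered manually here; the real
# script replaces that with BaiduOcr.getVcode().
import http.cookiejar
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# One opener with a cookie jar, so the session that issued the captcha is the
# same session that receives the login POST.
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

# Step 1: fetch the captcha image.
with open('image.jpg', 'wb') as f:
    f.write(opener.open('http://jwurp.hhuc.edu.cn/validateCodeAction.do').read())

code = input('captcha shown in image.jpg: ')  # stand-in for the Baidu OCR call

# Step 2: POST the login form with the same opener.
form = urllib.parse.urlencode({'zjh': 'YOUR_SNO', 'mm': 'YOUR_PASSWORD', 'v_yzm': code})
html = opener.open('http://jwurp.hhuc.edu.cn/loginAction.do',
                   form.encode('gb2312')).read().decode('gb2312')

# Step 3: parse the response; a page title containing '错误信息' means the login failed.
print(BeautifulSoup(html, 'lxml').title.string)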

I. Setting Up the Environment

Install the following modules with pip from a cmd prompt:

pip install bs4
pip install lxml
pip install xlwt
pip install xlrd
pip install xlutils
pip install baidu-aip
pip install pillow
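As an optional sanity check (not part of the original article), you can confirm that everything imports under the names the code uses — note that baidu-aip installs as aip and pillow as PIL:

python -c "import bs4, lxml, xlwt, xlrd, xlutils, aip, PIL; print('ok')"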

Create a folder named data in the current working directory; the generated Excel files are saved there.
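You can create the folder by hand, or let Python do it; this one-liner is an optional alternative and is not part of the original scripts:

import os

os.makedirs('data', exist_ok=True)  # create ./data if it does not already exist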

II. The Scraper Code

Change the student ID (SNO) and password (pwd) in the script to your own.
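Hard-coding them works for a personal script; if you prefer to keep credentials out of the source file, one optional alternative (not part of the original code) is to read them from environment variables:

import os

# Hypothetical alternative to the hard-coded SNO / pwd constants in HohaiUrpSpider.py.
SNO = os.environ.get('URP_SNO', '')  # student ID
pwd = os.environ.get('URP_PWD', '')  # password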

HohaiUrpSpider.py

# encoding: utf-8
'''
@author: weiyang_tang
@contact: weiyang_tang@126.com
@file: HohaiUrpSpider_01.py
@time: 2019-02-18 16:03
@desc: 1. Use Baidu OCR to recognize the academic affairs system's captcha;
       2. use urllib to simulate the login and fetch pages;
       3. use BeautifulSoup to parse the pages and extract the required data;
       4. use xlwt, xlrd and xlutils to create, read and modify the Excel file.
'''
import urllib.request, urllib.parse, urllib.error
import http.cookiejar

from BaiduOcr import getVcode
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy

SNO = '你的学号'  # student ID
pwd = '你的密码'  # password
SName = ''        # student name; filled in automatically, no need to set it

capurl = "http://jwurp.hhuc.edu.cn/validateCodeAction.do"  # captcha URL
loginUrl = "http://jwurp.hhuc.edu.cn/loginAction.do"       # login URL

logInMaxTryTimes = 10     # maximum captcha retries; beyond this we assume the ID/password is wrong and stop
getPersonalInfoTimes = 0  # captcha attempts made while fetching personal info
getGradesTimes = 0        # captcha attempts made while fetching grades

# Keep the cookies of a successful login
cookie_jar = http.cookiejar.CookieJar()
cookie_jar_handler = urllib.request.HTTPCookieProcessor(cookiejar=cookie_jar)
opener = urllib.request.build_opener(cookie_jar_handler)

picPath = 'D:/image.jpg'  # where the captcha image is saved


# Log in to the academic affairs system
def AutomaticLogin():
    # Use Baidu OCR to recognize the captcha. Since recognition can fail, retry on
    # failure; if it keeps failing, assume the student ID and password do not match.
    picture = opener.open(capurl).read()
    local = open(picPath, 'wb')
    local.write(picture)  # save the captcha image locally
    local.close()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
        'Host': 'jwurp.hhuc.edu.cn',
        'Origin': 'http://jwurp.hhuc.edu.cn',
        'Referer': 'http://jwurp.hhuc.edu.cn/loginAction.do'
    }
    code = getVcode()
    print(code)
    postdatas = {'zjh': SNO, 'mm': pwd, 'v_yzm': code}
    # Simulate the login POST
    data = urllib.parse.urlencode(postdatas).encode(encoding='gb2312')
    request = urllib.request.Request(loginUrl, data, headers)
    try:
        response = opener.open(request)
        html = response.read().decode('gb2312')
        # print(html)
        soup = BeautifulSoup(html, 'lxml')
        print(soup.title.string)
        title = soup.title.string
        if (title.__contains__('错误信息')):  # '错误信息' ("error message") in the title means the login failed
            AutomaticLogin()
    except urllib.error.HTTPError as e:
        print(e.code)


def getGrades():
    global getGradesTimes
    scoreList = []  # holds the grade rows
    AutomaticLogin()
    # Fetch this semester's grades
    gradeUrl = 'http://jwurp.hhuc.edu.cn/bxqcjcxAction.do'
    gradeRequest = urllib.request.Request(gradeUrl)
    responseGrade = opener.open(gradeRequest).read().decode('gb2312')
    # print(responseGrade)
    soup = BeautifulSoup(responseGrade, 'lxml')
    if (soup.title.string != None):
        title = soup.title.string
        if (title.__contains__('错误信息')):
            getGradesTimes = getGradesTimes + 1
            if (getGradesTimes <= logInMaxTryTimes):
                getGrades()
                return None
            else:
                print('请检查账号和密码是否正确')  # "please check that the account and password are correct"
                return None
    try:
        old_excel = xlrd.open_workbook('data/' + SNO + '_' + SName + '.xls', formatting_info=True)
    except Exception:
        # The workbook does not exist yet: create it with the personal info first, then retry
        getPersonalInfo()
        getGrades()
        return None
    new_excel = copy(old_excel)
    ws = new_excel.add_sheet('本学期成绩')  # "this semester's grades" sheet
    rowIndex = 0
    colIndex = 0
    for th in soup.find_all(name='th'):  # table header row
        ws.write(rowIndex, colIndex, th.string.strip())
        colIndex = colIndex + 1
        print('%-60s' % th.string.strip(), end=' ')
    print()
    rowIndex = 1
    for tr in soup.find_all(class_='odd'):  # one table row per course
        scoreList.append([])
        colIndex = 0
        for td in tr.find_all(name='td'):
            scoreList[rowIndex - 1].append(td.string.strip())
            ws.write(rowIndex, colIndex, td.string.strip())
            colIndex = colIndex + 1
            print('%-60s' % td.string.strip(), end=' ')
        rowIndex = rowIndex + 1
        print()
    gpa = getGPA(scoreList)
    ws.write(rowIndex + 2, colIndex, '本学期平均绩点为' + str(gpa))  # "this semester's GPA is ..."
    print(scoreList)
    new_excel.save('data/' + SNO + '_' + SName + '.xls')


def getPersonalInfo():
    global getPersonalInfoTimes
    personalInfo = []
    AutomaticLogin()
    personalInfoUrl = 'http://jwurp.hhuc.edu.cn/xjInfoAction.do?oper=xjxx'  # personal-information URL
    gradeRequest = urllib.request.Request(personalInfoUrl)
    responseGrade = opener.open(gradeRequest).read().decode('gb2312')
    myWorkbook = xlwt.Workbook()
    mySheet = myWorkbook.add_sheet('个人信息')  # "personal information" sheet
    rowIndex = 0
    soup = BeautifulSoup(responseGrade, 'lxml')
    if (soup.title.string != None):
        title = soup.title.string
        if (title.__contains__('错误信息')):
            getPersonalInfoTimes = getPersonalInfoTimes + 1
            if (getPersonalInfoTimes <= logInMaxTryTimes):
                getPersonalInfo()
                return None
            else:
                return None
    for table in soup.find_all(id='tblView'):
        for tr in table.find_all(name='tr'):
            personalInfo.append([])
            colIndex = 0
            for td in tr.find_all(name='td'):
                if td.string != None:
                    personalInfo[rowIndex].append(td.string.strip())
                    mySheet.write(rowIndex, colIndex, td.string.strip())
                    colIndex = colIndex + 1
                    print(td.string.strip(), end='')
            rowIndex = rowIndex + 1
            print()
    global SName
    SName = personalInfo[0][3]  # the student's name is the fourth cell of the first row
    myWorkbook.save('data/' + SNO + '_' + SName + '.xls')


def getGPA(scoreList):
    '''Compute this semester's credit-weighted GPA.'''
    sumCredit = 0
    sumPA = 0.0
    for rowIndex in range(len(scoreList)):
        if scoreList[rowIndex][5] == '必修':  # only required ('必修') courses count
            sumCredit = sumCredit + float(scoreList[rowIndex][4])
            sumPA = sumPA + getPA(scoreList[rowIndex][9]) * float(scoreList[rowIndex][4])
    try:
        avgPA = sumPA / sumCredit
        print('本学期平均绩点为' + str(avgPA))
        return avgPA
    except Exception:
        pass


def getPA(score):
    # Map a grade (either a word such as '优秀' = "excellent" or a numeric score) to a grade point
    if score == "优秀":
        return 5.0
    if score == "良好":
        return 4.5
    if score == "中等":
        return 3.5
    if score == "及格":
        return 2.5
    if score == "不及格":
        return 0.0
    if float(score) >= 90 and float(score) <= 100:
        return 5.0
    if float(score) >= 85 and float(score) <= 89:
        return 4.5
    if float(score) >= 80 and float(score) <= 84:
        return 4.0
    if float(score) >= 75 and float(score) <= 79:
        return 3.5
    if float(score) >= 70 and float(score) <= 74:
        return 3.0
    if float(score) >= 65 and float(score) <= 69:
        return 2.5
    if float(score) >= 60 and float(score) <= 65:
        return 2.0
    if float(score) <= 59:
        return 0.0


if __name__ == '__main__':
    getPersonalInfo()
    getGrades()
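To make the GPA logic in getGPA and getPA concrete, here is a hypothetical worked example; it assumes the grade table's columns put the credits at index 4, the course type at index 5, and the score at index 9, as the code above does.

# Two hypothetical required ('必修') courses; electives are skipped by getGPA.
# getPA maps a score of 92 to grade point 5.0 and 78 to 3.5.
credits_and_points = [(3.0, 5.0),   # 3-credit course, score 92
                      (2.0, 3.5)]   # 2-credit course, score 78

sum_pa = sum(credit * gp for credit, gp in credits_and_points)  # 3*5.0 + 2*3.5 = 22.0
sum_credit = sum(credit for credit, _ in credits_and_points)    # 3 + 2 = 5.0
print(sum_pa / sum_credit)                                      # 22.0 / 5.0 = 4.4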

BaiduOcr.py
This module uses Baidu AI's text-recognition (OCR) API to read the captcha.

# encoding: utf-8
'''
@author: weiyang_tang
@contact: weiyang_tang@126.com
@file: BaiduOcr.py
@time: 2019-02-09 15:10
@desc: captcha recognition based on Baidu OCR
'''
from aip import AipOcr
from PIL import Image

# Baidu AI application credentials
APP_ID = '15537967'
API_KEY = 'WfwAe7nwLBiLRiEThmQcrsG4'
SECRET_KEY = 'G3kHHD2QhvsfVk3jtLmvlR7O5qASXp5l'

# Initialize the AipOcr client
aipOcr = AipOcr(APP_ID, API_KEY, SECRET_KEY)

# Path of the captcha image to read
filePath = "D:/image.jpg"


def get_file_content(filePath):
    # im = Image.open('D:/image.jpg')  # 1. open the image
    # im.show()
    with open(filePath, 'rb') as fp:
        return fp.read()


def getVcode():
    # Request options
    options = {
        'detect_direction': 'true',
        'language_type': 'CHN_ENG',
    }
    # Call the general text-recognition endpoint
    result = aipOcr.basicGeneral(get_file_content(filePath), options)
    print(result)
    keyWord = ''
    try:
        if (result["words_result"] != None and result["words_result"] != ''):
            keyWord = result["words_result"][0]['words'].replace(' ', '')
    except Exception:
        pass
    return keyWord


if __name__ == '__main__':
    code = getVcode()
    print(code)

III. Results

Running HohaiUrpSpider.py produces data/<student ID>_<name>.xls, with a "个人信息" (personal information) sheet and a "本学期成绩" (this semester's grades) sheet that also records the computed GPA.
