找回密码
 会员注册
查看: 27|回复: 0

python知网爬虫论文pdf下载+立即可用(动态爬虫)

[复制链接]

3

主题

0

回帖

10

积分

新手上路

积分
10
发表于 2024-9-9 13:05:22 | 显示全部楼层 |阅读模式
"""Search CNKI for a keyword, print each paper's metadata and download its PDF.

Shared from the author's own work, freshly revised.  Only ``keyword`` needs
to be edited: the script then searches automatically, downloads the papers
and turns the result pages by itself.  Chrome must be installed; the script
also works on Linux, where the non-GUI (headless) Chrome build is required.
"""
import json
import os
import re
import time
from urllib.parse import quote as url_quote

import pandas as pd
import requests
import selenium.webdriver as webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

keyword = "你的query"  # search keyword -- the only value that needs editing

SEARCH_URL = (
    "https://kns.cnki.net/kns8s/defaultresult/index"
    "?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA"
    "%2CBLZOG7CK%2CEMRPGLPA%2CPWFIRAGL%2CNLBO1Z6R%2CNN3FJMUV"
    "&korder=SU&kw="
)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
)
ROW_XPATH = '//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr'
PDF_DIR = "./pdf"
PAGE_LOAD_SECONDS = 3  # fixed pause used wherever the original script slept


def build_options():
    """Return the Chrome options shared by every browser this script starts."""
    option = ChromeOptions()
    # Run in developer mode (drops the "enable-automation" switch).
    option.add_experimental_option("excludeSwitches", ["enable-automation"])
    # Avoid start-up errors and run without a visible window.
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-dev-shm-usage')
    option.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
    option.add_argument("--headless")
    # Google's documentation mentions this flag as a bug workaround.
    option.add_argument('--disable-gpu')
    option.add_argument(f'user-agent={USER_AGENT}')
    return option


def text_or_none(parent, css):
    """Return the text of the first element matching *css*, or None if absent."""
    try:
        return parent.find_element(By.CSS_SELECTOR, value=css).text
    except NoSuchElementException:
        return None


def attr_or_none(parent, css, attribute):
    """Return *attribute* of the first element matching *css*, or None if absent."""
    try:
        element = parent.find_element(By.CSS_SELECTOR, value=css)
    except NoSuchElementException:
        return None
    return element.get_attribute(attribute)


def safe_filename(name):
    """Replace characters that are invalid in file names with underscores."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", name or "").strip(" .")
    return cleaned or "untitled"


def save_pdf(pdf_link, name):
    """Fetch *pdf_link* with requests and write it to ``PDF_DIR/<name>.pdf``.

    Does nothing when the page exposed no PDF link; a failed request is
    reported and skipped so the crawl can continue.
    """
    if not pdf_link:
        return
    os.makedirs(PDF_DIR, exist_ok=True)
    try:
        response = requests.get(pdf_link, timeout=60)
    except requests.RequestException as err:
        print("pdf请求失败:", err)
        return
    with open(os.path.join(PDF_DIR, safe_filename(name) + ".pdf"), "wb") as f:
        f.write(response.content)


def show_paper_page(detail_driver, name, name_link):
    """Open the paper's detail page, print its details and trigger the PDF download."""
    detail_driver.get(name_link)

    institute = text_or_none(detail_driver, 'div.brief h3:nth-last-child(1)')  # institution
    print("机构:", "无机构信息" if institute is None else institute)

    # Abstract, keywords and similar rows; find_elements returns [] when none match.
    for info in detail_driver.find_elements(By.CSS_SELECTOR, value='div.doc-top div.row'):
        print(info.text.strip())

    pdf_link = attr_or_none(detail_driver, '#pdfDown', "href") or ""
    print("pdf下载地址:", pdf_link)

    # The author notes that opening this PDF address directly seems to be
    # rejected with an "application source" error, so besides the plain
    # request the download button is also clicked in the browser.
    save_pdf(pdf_link, name)
    time.sleep(PAGE_LOAD_SECONDS)  # wait for the page to finish loading
    try:
        detail_driver.find_element(By.CSS_SELECTOR, value='#pdfDown').click()
    except WebDriverException:
        print("未找到pdf下载按钮")
    else:
        time.sleep(PAGE_LOAD_SECONDS)  # wait for the PDF download to finish


def show_journal_page(detail_driver, source, source_link):
    """Open the journal page (when a link exists) and print its details."""
    text = "本期刊缺乏信息"
    if source_link:
        detail_driver.get(source_link)
        try:
            # Expand the detailed information.
            detail_driver.find_element(By.XPATH, '//a[@id="J_sumBtn-stretch"]').click()
        except WebDriverException:
            pass  # nothing to expand
        try:
            text = detail_driver.find_element(
                By.XPATH, '//dd[@class="infobox"]/div[@class="listbox clearfix"]'
            ).text
        except NoSuchElementException:
            pass  # keep the "no information" placeholder
    print("--------本期刊详细信息---------")
    print("期刊名:", source)
    print(text)


def process_row(li, detail_driver):
    """Print one result row and visit its paper and journal pages."""
    name = text_or_none(li, 'td.name a')
    name_link = attr_or_none(li, 'td.name a', "href")
    if not name_link:
        return  # not a paper row (no title link), nothing to open
    author = text_or_none(li, 'td.author')
    source = text_or_none(li, 'td.source a')
    source_link = attr_or_none(li, 'td.source a', "href")
    print(source_link)
    date = text_or_none(li, 'td.date')  # publication date
    data = text_or_none(li, 'td.data')  # source database
    quote = text_or_none(li, 'td.quote')  # citation count
    downloadCount = text_or_none(li, 'td.download')  # download count

    print("\n\n\n")
    print("文章名称:", name)
    print("作者:", author)
    print("文章来源:", source)
    print("发表日期:", date)
    print("数据库:", data)
    if quote:
        print("被引次数:", quote)
    if downloadCount:
        print("下载次数:", downloadCount)

    show_paper_page(detail_driver, name, name_link)
    show_journal_page(detail_driver, source, source_link)


def main():
    """Run the search for ``keyword`` and walk every result page."""
    option = build_options()
    url = SEARCH_URL + url_quote(str(keyword))
    driver = webdriver.Chrome(options=option)
    detail_driver = None
    try:
        # One extra browser is reused for every paper and journal page
        # instead of launching two new Chrome processes per result row.
        detail_driver = webdriver.Chrome(options=option)
        driver.get(url)
        while True:
            time.sleep(PAGE_LOAD_SECONDS)  # wait for the result page to load
            for li in driver.find_elements(By.XPATH, ROW_XPATH):
                try:
                    process_row(li, detail_driver)
                except WebDriverException as err:
                    # One broken row must not abort the whole crawl.
                    print("处理该条目失败,已跳过:", err)
            # Simulate a click on "next page"; stop when there is none.
            try:
                driver.find_element(By.XPATH, '//*[@id="PageNext"]').click()
            except WebDriverException:
                break
    finally:
        if detail_driver is not None:
            detail_driver.quit()
        driver.quit()


if __name__ == "__main__":
    main()
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 会员注册

本版积分规则

QQ|手机版|心飞设计-版权所有:微度网络信息技术服务中心 ( 鲁ICP备17032091号-12 )|网站地图

GMT+8, 2025-1-10 06:16 , Processed in 0.918894 second(s), 26 queries .

Powered by Discuz! X3.5

© 2001-2025 Discuz! Team.

快速回复 返回顶部 返回列表