import time from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from urllib.parse import urljoin import xlwt
desired_capabilities = DesiredCapabilities.CHROME desired_capabilities["pageLoadStrategy"] = "none"
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) options.add_argument("--disable-autocomplete")
driver = webdriver.Chrome(options=options)
source = "经济" theme = "导购"
papers_need = 100 driver.maximize_window()
url="https://kns.cnki.net/kns/advsearch?dbcode=SCDB"
driver.get(url) time.sleep(1) print('已成功打开浏览器,url为:',url)
workbook=xlwt.Workbook() sheet=workbook.add_sheet('Sheet1') sheet.write(0,0,'题名') sheet.write(0,1,'作者') sheet.write(0,2,'来源') sheet.write(0,3,'发表时间') sheet.write(0,4,'数据库') sheet.write(0,5,'被引') sheet.write(0,6,'下载') sheet.write(0,7,'链接')
WebDriverWait(driver, 100).until( EC.presence_of_element_located((By.XPATH, '''//*[@id="magazine_value1"]'''))).send_keys(source) time.sleep(1) print("成功输入来源:",source)
WebDriverWait(driver, 100).until( EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[9]"))).click() time.sleep(1)
print("成功确认检索,得到内容.....")
def get_item(num,col=1): index = 1 while True: index%=20 title_XPATH = '//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[2]/a' author_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[3]' source_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[4]' time_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[5]' database_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[6]' refer_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[7]' download_XPATH='//*[@id="gridTable"]/div/div[2]/table/tbody/tr[' + str(index) + ']/td[8]'
try: title = driver.find_element(By.XPATH, title_XPATH).text author=driver.find_element(By.XPATH,author_XPATH).text source = driver.find_element(By.XPATH, source_XPATH).text times = driver.find_element(By.XPATH, time_XPATH).text database = driver.find_element(By.XPATH, database_XPATH).text refer = driver.find_element(By.XPATH, refer_XPATH).text download = driver.find_element(By.XPATH, download_XPATH).text link=driver.find_element(By.XPATH, title_XPATH).get_attribute('href')
print(title,"\t",author,source,times,database,refer,download) if col<num: sheet.write(col, 0, title) sheet.write(col, 1, author) sheet.write(col, 2, source) sheet.write(col, 3, times) sheet.write(col, 4, database) sheet.write(col, 5, refer) sheet.write(col, 6, download) sheet.write(col, 7, link) else: print("-----------------------------------------------------------------------------") print("所有爬取已经结束") return index += 1 col+=1 except: print("当前页爬取结束") index+=1 WebDriverWait(driver, 100).until( EC.presence_of_element_located((By.XPATH, '''//*[@id="PageNext"]'''))).click() time.sleep(1)
print("正在爬取内容.....") print("------------------------------------------------------------------------------") get_item(papers_need)
workbook.save('output.xls') print("爬取结果已储存在文件output.xls中")
driver.close()
|