Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门
# -*- coding: utf-8 -*-
"""
Scrape a paginated results table from searchbio.aspx with Selenium and
save each page's table as a numbered CSV file (1.csv ... 196.csv).

Created on Sat May 7 09:32:30 2016
@author: daxiong
"""
import requests, bs4, selenium  # kept: possibly used elsewhere in the project
import csv
import random
import time

from selenium import webdriver

site1 = "http://118.114.237.85:8081/searchbio.aspx"
charset = "gb2312"  # NOTE(review): declared but never applied anywhere — confirm intent
pages = 196         # total number of result pages to walk through

# Module-level browser session shared by all helpers below.
browser = webdriver.Firefox()
browser.get(site1)


def Get_one_table():
    """Return the text of every element of class 'tb' on the current page.

    The first matched element is skipped — presumably a header row;
    TODO confirm against the live page.
    """
    elems = browser.find_elements_by_class_name("tb")
    return [e.text for e in elems[1:]]


def div_list(ls, n):
    """Split list *ls* into *n* roughly equal consecutive chunks.

    Returns [] for invalid input: non-list *ls*, non-int *n*, n <= 0,
    empty list, or n > len(ls).  When n == len(ls) each element gets its
    own one-item chunk; otherwise the first n-1 chunks hold len(ls)//n
    items each and the final chunk absorbs the remainder.
    """
    if not isinstance(ls, list) or not isinstance(n, int):
        return []
    ls_len = len(ls)
    if n <= 0 or ls_len == 0 or n > ls_len:
        return []
    if n == ls_len:
        return [[item] for item in ls]
    j = ls_len // n  # base chunk size (idiomatic floor division)
    chunks = [ls[i:i + j] for i in range(0, (n - 1) * j, j)]
    chunks.append(ls[(n - 1) * j:])  # last chunk takes the remainder
    return chunks


def Write_table_to_csv(fileName, list_tableContent):
    """Write one page's table rows (a list of lists) to *fileName* as CSV.

    Fix: use a context manager so the file handle is closed even if
    writing raises, and write UTF-8 with BOM so the Chinese cell text
    opens correctly in Excel regardless of the OS locale encoding.
    """
    with open(fileName, 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(list_tableContent)


def Click_next_page():
    """Advance to the next result page by clicking the '下一页' link."""
    linkElem = browser.find_element_by_link_text("下一页")
    linkElem.click()


for i in range(1, pages + 1):
    rows = Get_one_table()
    table = div_list(rows, 20)  # 20 rows per page
    Write_table_to_csv(str(i) + ".csv", table)
    # Fix: the original clicked "next" after the final page too, where no
    # such link exists, raising NoSuchElementException on the last loop.
    if i < pages:
        Click_next_page()
        time.sleep(random.randint(0, 5))  # polite randomized crawl delay
版本3（计划）
加入多线程采集 —— 注意：当前代码仍为单线程顺序采集，多线程尚未实现。