技巧篇

豆粕成交量与持仓量爬取

利用webdriver模拟浏览器爬取大连交易所豆粕的成交量和持仓量数据。网站上只有最近一年的数据可以下载,2014年的数据不能下载,所以写了这个脚本。网站封杀爬虫很严重,为了爬一年的数据跑了好几遍,换ip也不行,只好把程序写成一个月一个月地爬,某个月出了问题就把该月的数据删除后重新爬。不够智能,但是还算好用,也不知道有什么更好的方法,就凑合用吧,先记录下,搞不好以后还得用。

import calendar
import random
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Year selected in the site's calendar control.
year = 2014

# Positions of the month <option> entries to crawl (1-based).  The full-year
# list would be list(range(1, 13)); only May..December are crawled here.
month = list(range(5, 13))

# Contract codes whose rows are looked up in each day's table.
pinzhong = [1401, 1405, 1409, 1501, 1505, 1509]

# One shared Firefox session plus an explicit wait with a 30 second timeout.
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 30)

# CSV file that the crawl loop appends one row per (day, contract) to.
filepath = 'C:\\doupo1.csv'


def monthsumday(year, month):
    """Return the number of days in ``month`` (1-12) of ``year``."""
    # calendar.monthrange returns (weekday of the 1st, number of days).
    _first_weekday, day_count = calendar.monthrange(year, month)
    return day_count

'''def writeexcel(filepath,data,trunk):
    try:
        writer = pd.ExcelWriter(filepath)
        df1 = pd.DataFrame(data)
        df1.to_excel(writer,'Sheet1')
        writer.save()
    except Exception as e:
        print("write in excel error:%s"%e)'''

def _read_cell(contract, column):
    """Return the text of cell ``td[column]`` in the row for ``contract``.

    The row is located inside the element with class ``dataArea`` by matching
    a node whose text equals the contract code.  When no such row/cell exists
    (e.g. the contract is not listed in that day's table) 0 is returned so the
    CSV still gets a row for that day.

    Only ``NoSuchElementException`` is treated as "no data"; any other driver
    failure or Ctrl+C propagates instead of being silently recorded as 0.
    """
    xpath = '''//*[@class="dataArea"]//tbody//*[text()='%s']/../td[%d]''' % (contract, column)
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return 0


# Crawl every day of every configured month and append one CSV row per
# (day, contract).  The CSV is not opened here: pandas opens it in append mode
# for each row, so the script also works when the file does not exist yet.
try:
    for imonth in month:
        # Reload the statistics page for each month and enter the iframe that
        # hosts the calendar and the data table.
        driver.get('http://www.dce.com.cn/dalianshangpin/xqsj/tjsj26/rtj/rxq/index.html')
        driver.switch_to.frame(driver.find_element(By.XPATH, '''//*[@class="container_w"]//iframe'''))
        wait.until(EC.element_to_be_clickable((By.XPATH, "/html/body/form[1]/div/div[1]/div[3]/div/ul[2]/li[3]/input"))).click()
        driver.find_element(By.XPATH, "//*[@id='control']/select[1]/*[text()='%s']" % year).click()

        # imonth is the 1-based position of the <option>; the calendar month
        # used for the day count is read from the option's own text.
        everymonth = driver.find_element(By.XPATH, "//*[@id='control']/select[2]/option[%s]" % imonth)
        daysum = monthsumday(year, int(everymonth.text))
        everymonth.click()

        for day in range(1, daysum + 1):
            zfullday = str(day).zfill(2)
            # Random pause between requests to look less like a bot.
            time.sleep(random.uniform(0.5, 2))
            driver.find_element(By.XPATH, '''//*[@id="calender"]/table[1]/tbody[1]//*[text()='%s']''' % zfullday).click()
            # NOTE(review): this waits for the calendar element, which already
            # exists before the click, so it likely does not guarantee that
            # the data table has refreshed -- confirm against the live page.
            wait.until(EC.presence_of_element_located((By.XPATH, '''//*[@id="calender"]''')))

            # One row per contract code.
            for item in pinzhong:
                chengjiao = _read_cell(item, 11)  # reported below as volume
                chicang = _read_cell(item, 12)    # reported below as open interest
                data = {'year': year, 'imonth': imonth, 'day': day, 'jiaoge': item, 'chengjiao': chengjiao, 'chicang': chicang}
                print('%s月%s日%s成交量%s持仓量%s' % (imonth, day, item, chengjiao, chicang))

                df = pd.DataFrame.from_dict(data, orient='index').T
                df.to_csv(filepath, mode='a', header=False)
finally:
    # Always release the browser process, even when the crawl aborts.
    driver.quit()