技巧篇

豆粕成交量与持仓量爬取

利用 webdriver 模拟浏览器,爬取大连商品交易所豆粕的成交量和持仓量数据。其中只有最近一年和 2014 年的数据无法直接下载,所以写了这个脚本。网站封杀爬虫很严重,为了爬一年的数据跑了好几遍,换 IP 也不行,只好把程序写成一个月一个月地爬:哪个月出了问题,就删除该月的数据重新爬。不够智能,但还算好用,暂时也不知道有什么更好的方法,就先凑合用吧。先记录下来,搞不好以后还得用。

import calendar
import random
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Year whose daily statistics are scraped.
year = 2014

# Months still to fetch (5..12); start the range at 1 to scrape the full year.
month = list(range(5, 13))

# Codes matched against the row text of the daily table and stored under the
# 'jiaoge' key -- presumably contract delivery months (YYMM); confirm.
pinzhong = [1401, 1405, 1409, 1501, 1505, 1509]

# One shared Firefox session plus an explicit wait with a 30-second timeout.
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 30)

# CSV file that every scraped row is appended to.
filepath = 'C:\\doupo1.csv'
def monthsumday(year: int, month: int) -> int:
    """Return the number of days in the given month of the given year.

    Args:
        year: Calendar year, e.g. 2014.
        month: Month number, 1 through 12.

    Returns:
        The day count of that month (28-31), leap years included.

    Raises:
        calendar.IllegalMonthError: If *month* is outside 1..12.
    """
    # monthrange() returns (weekday_of_first_day, days_in_month).
    return calendar.monthrange(year, month)[1]
# NOTE: disabled Excel-writing helper. The whole definition is wrapped in a
# bare triple-quoted string, so Python evaluates and discards it; nothing
# inside it is ever executed and no `writeexcel` name is defined.
'''def writeexcel(filepath,data,trunk):
try:
writer = pd.ExcelWriter(filepath)
df1 = pd.DataFrame(data)
df1.to_excel(writer,'Sheet1')
writer.save()
except Exception as e:
print("write in excel error:%s"%e)'''
def _cell_text(contract, column):
    """Return the text of table cell *column* in the row matching *contract*.

    The row is located by an element inside the "dataArea" table whose text
    equals *contract*; *column* is the 1-based ``td`` index within that row.
    Returns 0 when the row or cell cannot be found (e.g. the contract is not
    listed for the selected day), matching the script's best-effort behavior.
    """
    xpath = '''//*[@class="dataArea"]//tbody//*[text()='%s']/../td[%d]''' % (contract, column)
    try:
        return driver.find_element(By.XPATH, xpath).text
    except (NoSuchElementException, StaleElementReferenceException):
        # Missing or re-rendered row: record 0 instead of aborting the run.
        return 0


# Scrape every configured month, day by day, appending one CSV row per
# contract per day. No file handle is opened up front: to_csv(mode='a')
# creates the file when it does not exist yet.
try:
    for imonth in month:
        # Reload the statistics page for each month and enter its iframe.
        driver.get('http://www.dce.com.cn/dalianshangpin/xqsj/tjsj26/rtj/rxq/index.html')
        frame = driver.find_element(By.XPATH, '''//*[@class="container_w"]//iframe''')
        driver.switch_to.frame(frame)
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, "/html/body/form[1]/div/div[1]/div[3]/div/ul[2]/li[3]/input"))).click()

        # Pick the year, then the month, in the calendar control.
        driver.find_element(
            By.XPATH, "//*[@id='control']/select[1]/*[text()='%s']" % year).click()
        everymonth = driver.find_element(
            By.XPATH, "//*[@id='control']/select[2]/option[%s]" % imonth)
        # Read the month number from the option text before clicking it.
        daysum = monthsumday(year, int(everymonth.text))
        everymonth.click()

        for day in range(1, daysum + 1):
            # Calendar cells are matched by their zero-padded day text.
            zfullday = str(day).zfill(2)
            # Random pause between requests to look less like a bot.
            time.sleep(random.uniform(0.5, 2))
            driver.find_element(
                By.XPATH,
                '''//*[@id="calender"]/table[1]/tbody[1]//*[text()='%s']''' % zfullday).click()
            # NOTE(review): the "calender" element presumably exists before the
            # click already, so this likely does not wait for the data table to
            # refresh -- confirm whether waiting on "dataArea" is safe on
            # non-trading days.
            wait.until(EC.presence_of_element_located((By.XPATH, '''//*[@id="calender"]''')))

            for item in pinzhong:
                # td[11] is stored as 'chengjiao' and td[12] as 'chicang'.
                chengjiao = _cell_text(item, 11)
                chicang = _cell_text(item, 12)
                data = {'year': year, 'imonth': imonth, 'day': day, 'jiaoge': item,
                        'chengjiao': chengjiao, 'chicang': chicang}
                print('%s月%s日%s成交量%s持仓量%s' % (imonth, day, item, chengjiao, chicang))
                # One-row frame, appended immediately so a crash loses at most
                # the row in progress.
                df = pd.DataFrame.from_dict(data, orient='index').T
                df.to_csv(filepath, mode='a', header=False)
finally:
    # Always close the browser, even when the site blocks the run midway.
    driver.quit()