京東圖書評論有非常豐富的信息,其中包含了購買日期、書名、作者、好評、中評、差評等等。以購買日期為例,使用 Python + MySQL 的搭配來實現,程序不大,才100行。相關的解釋我都在程序裏加了註釋:
import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver
def mydebug():
    """Debug helper: close the browser and stop the program with status 0.

    Uses the module-level ``driver`` created by the script below.
    """
    driver.quit()
    # sys.exit instead of the bare ``exit`` builtin, which is only injected
    # by the ``site`` module and is not guaranteed to exist.
    sys.exit(0)
def catchDate(s):
    """Extract the purchase-date nodes from one review page.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one entry per ``div.date-buy`` element whose ``<br>``
        tag has children; each entry is that tag's ``contents`` list.

    Side effects:
        Increments the module-level ``nowtimes`` counter.
    """
    global nowtimes
    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed. The extraction below needs the text after <br> to be parsed
    # as a child of the <br> tag - confirm the installed parser does that
    # before pinning one.
    soup = BeautifulSoup(s)
    z = []
    for obj in soup.findAll("div", class_="date-buy"):
        br = obj.find('br')
        if br is None:
            # No <br> inside this block: nothing to extract.
            continue
        tmp = br.contents
        # ``contents`` is a list, so the old ``tmp != ""`` test was always
        # true; skip empty lists because callers index element 0.
        if tmp:
            z.append(tmp)
    # NOTE(review): the original indentation was lost; counting once per call
    # matches totaltimes (17355 * 20 page fetches) - confirm.
    nowtimes += 1
    return z
def getTimes(n, t):
    """Return the progress message for *n* finished steps out of *t*.

    Args:
        n: Number of steps completed so far.
        t: Total number of steps.

    Returns:
        The progress text with the percentage truncated to a whole number.
        A total of 0 is reported as 0% instead of raising ZeroDivisionError.
    """
    if not t:
        percent = 0
    else:
        # 100.0 forces float division on Python 2 as well.
        percent = int(100.0 * n / t)
    return "當前進度為:" + str(percent) + "%"
# ---------------------------- | program start | ----------------------------
# Book categories: category id -> display name.
cate = {"3273":"歷史","3279":"心理學","3276":"政治軍事","3275":"國學古籍","3274":"哲學宗教","3277":"法律","3280":"文化","3281":"社會科學"}

# Resume point of an interrupted crawl. The values are read as plain text:
# Python 2's input() would evaluate whatever the user types.
try:
    read_line = raw_input
except NameError:  # Python 3 has no raw_input
    read_line = input
num1 = read_line("bookid:").strip()
num2 = int(read_line("pagenumber:"))

# Expected number of page fetches: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the WebDriver browser (raw string so the backslashes stay literal).
#driver = webdriver.PhantomJS()
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')
#driver = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')

# Read the review pages listed in MySQL and crawl them.
conn = None
cursor = None
try:
    # Connect to the database.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
    except Exception as e:
        print(e)
        # The finally clause below still closes the browser.
        sys.exit(1)

    # Get a cursor and load every book row.
    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Parameterized insert: the driver escapes the values itself.
    insert_sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    # flag stays 0 until the row whose bookid equals num1 is reached; every
    # row from that one onwards is crawled.
    flag = 0
    # Page URLs look like htt/review/10178500-1-154.html
    for rec in alldata:
        # rec[0]--bookid, rec[1]--cateid, rec[2]--pagenumber
        if rec[0] != num1 and flag == 0:
            continue
        flag = 1

        # The resume offset applies to the first crawled book only. It is
        # cleared here, not inside the page loop, so it cannot leak into the
        # next book when this one yields no pages.
        first_page = num2
        num2 = 0

        for p in range(first_page + 1, rec[2] + 1):
            # NOTE(review): the URL prefix below looks truncated (no scheme
            # or host). The literal is kept as found - restore the full
            # review URL before running.
            link = "htteview/" + rec[0] + "-1-" + str(p) + ".html"
            # Fetch the page.
            driver.get(link)
            html = driver.page_source
            # Extract the purchase dates.
            buydate = catchDate(html)
            # Store them.
            # NOTE(review): the original passed rec[0] (bookid) into the
            # cateid column and rec[1] (cateid) into bookid; the values now
            # follow the column list. Rows written by the old code may need
            # migrating.
            for z in buydate:
                try:
                    cursor.execute(insert_sql, (rec[1], rec[0], z[0]))
                except Exception as e:
                    # Best effort: report the failed row and keep crawling.
                    print(e)
            # Commit once per page so an interrupted run keeps its progress.
            conn.commit()
            print(getTimes(nowtimes, totaltimes))
finally:
    # Always release the browser and the database handles.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()
京東圖書評論有非常豐富的信息,其中包含了購買日期、書名、作者、好評、中評、差評等等。以購買日期為例,使用 Python + MySQL 的搭配來實現,程序不大,才100行。相關的解釋我都在程序裏加了註釋:
from?selenium?import?webdriver
from?bs4?import?BeautifulSoup
import?re
import?win32com.client
import?threading,time
import?MySQLdb
def mydebug():
    """Debug helper: close the browser and stop the program with status 0.

    Uses the module-level ``driver`` created by the script below.
    """
    driver.quit()
    # sys.exit instead of the bare ``exit`` builtin, which is only injected
    # by the ``site`` module and is not guaranteed to exist.
    sys.exit(0)
def catchDate(s):
    """Extract the purchase-date nodes from one review page.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one entry per ``div.date-buy`` element whose ``<br>``
        tag has children; each entry is that tag's ``contents`` list.

    Side effects:
        Increments the module-level ``nowtimes`` counter.
    """
    global nowtimes
    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed. The extraction below needs the text after <br> to be parsed
    # as a child of the <br> tag - confirm the installed parser does that
    # before pinning one.
    soup = BeautifulSoup(s)
    z = []
    for obj in soup.findAll("div", class_="date-buy"):
        br = obj.find('br')
        if br is None:
            # No <br> inside this block: nothing to extract.
            continue
        tmp = br.contents
        # ``contents`` is a list, so the old ``tmp != ""`` test was always
        # true; skip empty lists because callers index element 0.
        if tmp:
            z.append(tmp)
    # NOTE(review): the original indentation was lost; counting once per call
    # matches totaltimes (17355 * 20 page fetches) - confirm.
    nowtimes += 1
    return z
def getTimes(n, t):
    """Return the progress message for *n* finished steps out of *t*.

    Args:
        n: Number of steps completed so far.
        t: Total number of steps.

    Returns:
        The progress text with the percentage truncated to a whole number.
        A total of 0 is reported as 0% instead of raising ZeroDivisionError.
    """
    if not t:
        percent = 0
    else:
        # 100.0 forces float division on Python 2 as well.
        percent = int(100.0 * n / t)
    return "當前進度為:" + str(percent) + "%"
# ---------------------------- | program start | ----------------------------
# Book categories: category id -> display name.
cate = {"3273":"歷史","3279":"心理學","3276":"政治軍事","3275":"國學古籍","3274":"哲學宗教","3277":"法律","3280":"文化","3281":"社會科學"}

# Resume point of an interrupted crawl. The values are read as plain text:
# Python 2's input() would evaluate whatever the user types.
try:
    read_line = raw_input
except NameError:  # Python 3 has no raw_input
    read_line = input
num1 = read_line("bookid:").strip()
num2 = int(read_line("pagenumber:"))

# Expected number of page fetches: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the WebDriver browser (raw string so the backslashes stay literal).
#driver = webdriver.PhantomJS()
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')
#driver = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')

# Read the review pages listed in MySQL and crawl them.
conn = None
cursor = None
try:
    # Connect to the database.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
    except Exception as e:
        print(e)
        # The finally clause below still closes the browser.
        sys.exit(1)

    # Get a cursor and load every book row.
    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Parameterized insert: the driver escapes the values itself.
    insert_sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    # flag stays 0 until the row whose bookid equals num1 is reached; every
    # row from that one onwards is crawled.
    flag = 0
    # Page URLs look like httreview/10178500-1-154.html
    for rec in alldata:
        # rec[0]--bookid, rec[1]--cateid, rec[2]--pagenumber
        if rec[0] != num1 and flag == 0:
            continue
        flag = 1

        # The resume offset applies to the first crawled book only. It is
        # cleared here, not inside the page loop, so it cannot leak into the
        # next book when this one yields no pages.
        first_page = num2
        num2 = 0

        for p in range(first_page + 1, rec[2] + 1):
            # NOTE(review): the URL prefix below looks truncated (no scheme
            # and an incomplete host). The literal is kept as found - restore
            # the full review URL before running.
            link = "ht.com/review/" + rec[0] + "-1-" + str(p) + ".html"
            # Fetch the page.
            driver.get(link)
            html = driver.page_source
            # Extract the purchase dates.
            buydate = catchDate(html)
            # Store them.
            # NOTE(review): the original passed rec[0] (bookid) into the
            # cateid column and rec[1] (cateid) into bookid; the values now
            # follow the column list. Rows written by the old code may need
            # migrating.
            for z in buydate:
                try:
                    cursor.execute(insert_sql, (rec[1], rec[0], z[0]))
                except Exception as e:
                    # Best effort: report the failed row and keep crawling.
                    print(e)
            # Commit once per page so an interrupted run keeps its progress.
            conn.commit()
            print(getTimes(nowtimes, totaltimes))
finally:
    # Always release the browser and the database handles.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()