當前位置:成語大全網 - 古籍修復 - 如何用 Python 爬取一本書的評論用戶

如何用 Python 爬取一本書的評論用戶

京東圖書評論包含非常豐富的信息,例如購買日期、書名、作者、好評、中評、差評等等。下面以購買日期為例,使用 Python + MySQL 搭配實現,程序不大,只有約 100 行。相關的解釋我都在程序裏加了註釋:

import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver

def mydebug():
    """Debugging aid: shut down the browser driver and end the program.

    Uses the module-level ``driver`` object created by the script body.

    Raises:
        SystemExit: always, with exit status 0.
    """
    driver.quit()
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    sys.exit(0)

def catchDate(s):
    """Extract the purchase-date entries from the HTML of one review page.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one item per ``<div class="date-buy">`` that yielded
        content; each item is the ``contents`` list of the first ``<br>``
        element found inside that div.

    Side effects:
        Increments the module-level counter ``nowtimes`` once per collected
        entry.
    """
    global nowtimes

    soup = BeautifulSoup(s)
    collected = []
    for block in soup.findAll("div", class_="date-buy"):
        marker = block.find('br')
        if marker is None:
            # No <br> inside this div, so there is nothing to read from it.
            continue
        pieces = marker.contents
        # ``contents`` is a list, so the former comparison against "" was
        # always true; test for emptiness instead so that callers indexing
        # the first element never receive an empty list.
        if pieces:
            collected.append(pieces)
            # NOTE(review): the source lost its indentation; counting one unit
            # per collected entry is the reconstructed placement -- confirm.
            nowtimes += 1
    return collected

def getTimes(n, t):
    """Format the current progress as a whole-number percentage.

    Args:
        n: Amount of work done so far.
        t: Total amount of work; must be non-zero because it is the divisor.

    Returns:
        The progress label followed by ``int(100 * n / t)`` and a ``%`` sign.
    """
    percent = int(100 * n / t)
    return "當前進度為:" + str(percent) + "%"

# ----------------------------- | program start | -----------------------------

# Book categories: category id -> category name.
cate = {"3273":"歷史","3279":"心理學","3276":"政治軍事","3275":"國學古籍","3274":"哲學宗教","3277":"法律","3280":"文化","3281":"社會科學"}

# Resume support: ask which book id and page number to continue from.
# The answers are read as plain text; Python 2's input() would evaluate the
# typed text as code, so it is only used where raw_input does not exist.
try:
    _read_line = raw_input
except NameError:
    _read_line = input
num1 = _read_line("bookid:").strip()
num2 = int(_read_line("pagenumber:"))

# Expected total amount of work: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the browser driver. Alternatives used during development were
# webdriver.PhantomJS() and webdriver.Chrome(r'C:\Python27\Scripts\chromedriver').
# Raw string: the path contains backslashes that must not be read as escapes.
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')

# Connect to the database that lists the review pages to fetch.
try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
except Exception as e:
    print(e)
    # Do not leave the browser running when the database is unreachable.
    driver.quit()
    sys.exit(1)

flag = 0   # becomes 1 once the book to resume from has been reached
flag2 = 0  # becomes 1 once the resume page offset has been consumed

cursor = None
try:
    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Placeholders keep scraped text from being interpreted as SQL.
    sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        if flag == 0:
            if rec[0] != num1:
                # Skip every book listed before the one to resume from.
                continue
            flag = 1

        # The page offset applies to the resume book only; every later book
        # starts from its first page.
        first_page = 1
        if flag2 == 0:
            first_page = num2 + 1
            flag2 = 1

        for p in range(first_page, int(rec[2]) + 1):
            # NOTE(review): the URL prefix was truncated in the source
            # ("htteview/"); the host below is a reconstruction -- confirm.
            link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(p) + ".html"

            # Fetch the page.
            driver.get(link)
            html = driver.page_source

            # Extract the purchase dates.
            buydate = catchDate(html)

            # Store them in the database.
            for z in buydate:
                try:
                    # Values are bound in column order (cateid, bookid, date);
                    # the first item of each entry is stored as plain text.
                    cursor.execute(sql, (rec[1], rec[0], u"{0}".format(z[0])))
                except Exception as e:
                    # Best effort: report the failed row and carry on.
                    print(e)
            # NOTE(review): nesting reconstructed from a flattened source --
            # commit once per page, report progress once per book; confirm.
            conn.commit()
        print(getTimes(nowtimes, totaltimes))
finally:
    # Release the browser and the database handles even after an error.
    driver.quit()
    if cursor is not None:
        cursor.close()
    conn.close()

京東圖書評論包含非常豐富的信息,例如購買日期、書名、作者、好評、中評、差評等等。下面以購買日期為例,使用 Python + MySQL 搭配實現,程序不大,只有約 100 行。相關的解釋我都在程序裏加了註釋:

from?selenium?import?webdriver

from?bs4?import?BeautifulSoup

import?re

import?win32com.client

import?threading,time

import?MySQLdb

def mydebug():
    """Debugging aid: shut down the browser driver and end the program.

    Uses the module-level ``driver`` object created by the script body.

    Raises:
        SystemExit: always, with exit status 0.
    """
    driver.quit()
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    sys.exit(0)

def catchDate(s):
    """Extract the purchase-date entries from the HTML of one review page.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one item per ``<div class="date-buy">`` that yielded
        content; each item is the ``contents`` list of the first ``<br>``
        element found inside that div.

    Side effects:
        Increments the module-level counter ``nowtimes`` once per collected
        entry.
    """
    global nowtimes

    soup = BeautifulSoup(s)
    collected = []
    for block in soup.findAll("div", class_="date-buy"):
        marker = block.find('br')
        if marker is None:
            # No <br> inside this div, so there is nothing to read from it.
            continue
        pieces = marker.contents
        # ``contents`` is a list, so the former comparison against "" was
        # always true; test for emptiness instead so that callers indexing
        # the first element never receive an empty list.
        if pieces:
            collected.append(pieces)
            # NOTE(review): the source lost its indentation; counting one unit
            # per collected entry is the reconstructed placement -- confirm.
            nowtimes += 1
    return collected

def getTimes(n, t):
    """Format the current progress as a whole-number percentage.

    Args:
        n: Amount of work done so far.
        t: Total amount of work; must be non-zero because it is the divisor.

    Returns:
        The progress label followed by ``int(100 * n / t)`` and a ``%`` sign.
    """
    percent = int(100 * n / t)
    return "當前進度為:" + str(percent) + "%"

# ----------------------------- | program start | -----------------------------

# Book categories: category id -> category name.
cate = {"3273":"歷史","3279":"心理學","3276":"政治軍事","3275":"國學古籍","3274":"哲學宗教","3277":"法律","3280":"文化","3281":"社會科學"}

# Resume support: ask which book id and page number to continue from.
# The answers are read as plain text; Python 2's input() would evaluate the
# typed text as code, so it is only used where raw_input does not exist.
try:
    _read_line = raw_input
except NameError:
    _read_line = input
num1 = _read_line("bookid:").strip()
num2 = int(_read_line("pagenumber:"))

# Expected total amount of work: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the browser driver. Alternatives used during development were
# webdriver.PhantomJS() and webdriver.Chrome(r'C:\Python27\Scripts\chromedriver').
# Raw string: the path contains backslashes that must not be read as escapes.
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')

# Connect to the database that lists the review pages to fetch.
try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
except Exception as e:
    print(e)
    # Do not leave the browser running when the database is unreachable.
    driver.quit()
    sys.exit(1)

flag = 0   # becomes 1 once the book to resume from has been reached
flag2 = 0  # becomes 1 once the resume page offset has been consumed

cursor = None
try:
    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Placeholders keep scraped text from being interpreted as SQL.
    sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        if flag == 0:
            if rec[0] != num1:
                # Skip every book listed before the one to resume from.
                continue
            flag = 1

        # The page offset applies to the resume book only; every later book
        # starts from its first page.
        first_page = 1
        if flag2 == 0:
            first_page = num2 + 1
            flag2 = 1

        for p in range(first_page, int(rec[2]) + 1):
            # NOTE(review): the URL prefix was truncated in the source
            # ("ht.com/review/"); the host below is a reconstruction -- confirm.
            link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(p) + ".html"

            # Fetch the page.
            driver.get(link)
            html = driver.page_source

            # Extract the purchase dates.
            buydate = catchDate(html)

            # Store them in the database.
            for z in buydate:
                try:
                    # Values are bound in column order (cateid, bookid, date);
                    # the first item of each entry is stored as plain text.
                    cursor.execute(sql, (rec[1], rec[0], u"{0}".format(z[0])))
                except Exception as e:
                    # Best effort: report the failed row and carry on.
                    print(e)
            # NOTE(review): nesting reconstructed from a flattened source --
            # commit once per page, report progress once per book; confirm.
            conn.commit()
        print(getTimes(nowtimes, totaltimes))
finally:
    # Release the browser and the database handles even after an error.
    driver.quit()
    if cursor is not None:
        cursor.close()
    conn.close()