# Scrape web pages with regular expressions and store the data in a local MySQL database via pymysql
import re
import random
import time
import pymysql
from urllib.request import Request, urlopen
class QSBKDataTool(object):
    # Example of a raw origin_data tuple captured by the list-page regex:
    # [('\n猩猩眨呀眨\n', '24', '\n\n\n昨晚同学聚会,以前的死对头非要坐我旁边,盯着我脸上的痘痘说她现在不吃肉,皮肤变好了。<br/>我放下筷子盯着她的水桶腰,说我只吃一点点肉,但不吃主食,所以体重控制的不错……<br/>聚会结束各自离去,也许是冤家路窄,我俩在牛肉面馆里又遇到了。\n\n', '5785', '43')]
    remove_n = re.compile(r'\n', re.S)
    remove_br = re.compile(r'<br/>|<br>', re.S)
@classmethod
def process_data(cls, origin_data):
result_data = []
for data in origin_data:
            # Clean the nickname (data[0])
nick_name = data[0]
            nick_name = re.sub(cls.remove_n, '', nick_name)  # same effect as str.replace('\n', '')
            # Clean the content (data[3])
content = data[3]
content = re.sub(cls.remove_n, '', content)
content = re.sub(cls.remove_br, '', content)
result_data.append((nick_name, data[1], data[2], content, data[4], data[5]))
return result_data
@classmethod
def process_next(cls, data):
next_page_str = data[0][1]
next_page_str = re.sub(cls.remove_n, '', next_page_str)
return (data[0][0], next_page_str)
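
# A minimal sketch of what QSBKDataTool.process_data does to one raw tuple.
# The values below are illustrative, following the field order the list-page
# regex captures (nickname, age, article href, content, smile count, comment count):
#   raw = ('\n猩猩眨呀眨\n', '24', '/article/120510346', '\n第一行<br/>第二行\n', '5785', '43')
#   QSBKDataTool.process_data([raw])
#   -> [('猩猩眨呀眨', '24', '/article/120510346', '第一行第二行', '5785', '43')]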
class QSBKDBTool(object):
db = None
cursor = None
@classmethod
def connect_db(cls):
cls.db = pymysql.connect(host='localhost', user='root', passwd='123456', db='qsbk', port=3306, charset='utf8')
cls.cursor = cls.db.cursor()
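        # Hedged note: the error handling below assumes emoji in page content can
        # break inserts. That is a limitation of MySQL's 3-byte 'utf8' charset;
        # connecting with charset='utf8mb4' (and defining the tables as utf8mb4)
        # would store emoji instead of raising. A sketch, not what this script does:
        #   pymysql.connect(host='localhost', user='root', passwd='123456',
        #                   db='qsbk', port=3306, charset='utf8mb4')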
@classmethod
def save_list_data(cls, list_data):
        # Iterate over list_data and run an INSERT for each row
for q_name, q_age, q_href, q_content, q_smail_num, q_comment_num in list_data:
            # The table uses the article id as the primary key, e.g. /article/120510346
q_id = q_href.split('/')[2]
insert_sql = "INSERT INTO qsbk (`q_id`, `q_name`, `q_age`, `q_href`, `q_content`, `q_smail_num`, q_comment_num ) VALUES (%s, %s, %s, %s, %s, %s, %s)"
try:
                cls.cursor.execute(insert_sql, (q_id, q_name, q_age, q_href, q_content, q_smail_num, q_comment_num))
                cls.db.commit()
except Exception as e:
                print('Primary key conflict or emoji in the content, skipping...')
cls.db.rollback()
@classmethod
    def save_detail_data(cls, q_id, detail_data):
if detail_data:
for comment in detail_data:
insert_sql = "INSERT INTO detail (comment, q_id) VALUES (%s,%s)"
try:
cls.cursor.execute(insert_sql, (comment, q_id))
                cls.db.commit()
except Exception as e:
                print('Detail page: primary key conflict or emoji in the content, skipping...', e)
cls.db.rollback()
@classmethod
def connect_close(cls):
cls.cursor.close()
cls.db.close()
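
# The script assumes the `qsbk` database already contains two tables. Their real
# definitions are not part of this script; the sketch below is inferred from the
# INSERT statements above, and all column types are assumptions:
#   CREATE TABLE qsbk (
#       q_id VARCHAR(32) PRIMARY KEY,
#       q_name VARCHAR(255), q_age VARCHAR(16), q_href VARCHAR(255),
#       q_content TEXT, q_smail_num VARCHAR(16), q_comment_num VARCHAR(16)
#   ) DEFAULT CHARSET=utf8;
#   CREATE TABLE detail (comment TEXT, q_id VARCHAR(32)) DEFAULT CHARSET=utf8;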
class QSBKDetailSpider(object):
"""
解析详情页
"""
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]
def __init__(self, url):
self.url = url
def get_page_detail(self):
user_agent = random.choice(self.user_agent_list)
request = Request(self.url, headers={'User-Agent': user_agent})
        # The page source may contain emoji
        try:
            response = urlopen(request)
            # decode() may succeed here and the database insert still fail later,
            # or decode() may raise immediately
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode() failed: {}, url: {}'.format(e, self.url))
                # Could not obtain this page's source
                return None
        except Exception as e:
            print('urlopen() failed: {}, url: {}'.format(e, self.url))
            return None
        else:
            return origin_html
def parse_page_detail(self, origin_html):
        if origin_html is not None:
            comment = re.findall(re.compile(r'<a.*?class="userlogin".*?<span class="body">(.*?)</span>', re.S), origin_html)
            # pop() removes and returns the last matched element, which is not a real comment
            if comment:
                comment.pop()
            return comment
        else:
            print('Detail page source is empty')
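
# Minimal usage sketch for the detail spider; the article URL is illustrative:
#   spider = QSBKDetailSpider('https://www.qiushibaike.com/article/120510346')
#   html = spider.get_page_detail()
#   comments = spider.parse_page_detail(html)  # list of comment strings, or None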
class QSBKSpider(object):
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]
def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/hot/page/'
def get_page_list(self, page_num):
"""
获取列表页数据
:param page_num: 页码
:return: 数据
"""
url = self.base_url + str(page_num)
user_agent = random.choice(self.user_agent_list)
request = Request(url, headers={'User-Agent': user_agent})
        # The page source may contain emoji
        try:
            response = urlopen(request)
            # decode() may succeed here and the database insert still fail later,
            # or decode() may raise immediately
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode() failed: {}, url: {}'.format(e, url))
                # Could not obtain this page's source
                return None
        except Exception as e:
            print('urlopen() failed: {}, url: {}'.format(e, url))
            return None
        else:
            return origin_html
def parser_page_list(self, origin_html):
"""
解析列表页的数据
:param origin_html: 某一页的网页源代码
:return: 解析并处理后的数据
"""
        if origin_html is not None:
            # Capture groups: nickname, age, article href, content, smile count, comment count
            pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div class="articleGender.*?">(.*?)</div>.*?<a.*?href="(.*?)".*?>.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(.*?)</i>.*?<i class="number">(.*?)</i>', re.S)
            origin_data = re.findall(pattern, origin_html)
            result_data = QSBKDataTool.process_data(origin_data)
            # Save the list page's result_data to the database
            QSBKDBTool.save_list_data(result_data)
            # Handle the detail pages
            self.get_detail_url(result_data)
            # Handle the next page
            next_page_pattern = re.compile(r'.*<span class="page-numbers">.*?<a href="(.*?)".*?>.*?<span.*?>(.*?)</span>', re.S)
            res = re.findall(next_page_pattern, origin_html)
            # If there is a next page, repeat the above logic; otherwise stop the crawler
            next_data = QSBKDataTool.process_next(res)
            if next_data[1] == '下一页':  # the site's "next page" label
                relation_url = next_data[0]
                number = re.search(r'(\d+)', relation_url).group()
                time.sleep(3)  # pause before requesting the next page
                html = self.get_page_list(number)
                self.parser_page_list(html)
            else:
                print('This is already the last page')
        else:
            print('origin_html is None')
def get_detail_url(self, data):
for data_tuple in data:
            detail_url = 'https://www.qiushibaike.com' + data_tuple[2]
q_id = data_tuple[2].split('/')[2]
detail_spider = QSBKDetailSpider(detail_url)
detail_html = detail_spider.get_page_detail()
res = detail_spider.parse_page_detail(detail_html)
QSBKDBTool.save_detail_data(q_id, res)
if __name__ == "__main__":
QSBKDBTool.connect_db()
qsbk = QSBKSpider()
origin_html = qsbk.get_page_list(1)
qsbk.parser_page_list(origin_html)
QSBKDBTool.connect_close()