声明:本文仅是为了学习而举例说明python的强大,禁止用于不良目的!
1、python可以打开浏览器并浏览网页,并且保存网页内容到本地硬盘
实现代码如下:
import urllib
import webbrowser as web  # "web" is just a shorter alias

# Page to fetch; the host must be a complete domain name to resolve.
url = "http://www.jd.com"

# Download the raw page bytes (Python 2 API; Python 3 spells this
# urllib.request.urlopen).
response = urllib.urlopen(url)
try:
    content = response.read()
finally:
    response.close()

# Binary mode keeps the downloaded bytes unchanged on every platform, and
# the context manager closes (flushes) the file before the browser reads it.
with open('data.html', 'wb') as page_file:
    page_file.write(content)

# Open the file that was just written in a new browser tab.
web.open_new_tab("data.html")
2、能够调用操作系统的命令关闭浏览器
Windows 命令是:taskkill /F /IM 应用名称,如 taskkill /F /IM qq.exe 就关闭了 qq
Linux 命令是:killall 进程名(不建议使用 kill),如 killall -9 qq;注意 /F、/IM 是 Windows taskkill 的参数,killall 并不支持
python实现代码如下:
import os
import sys

# Force-terminate the QQ client by process name using the operating
# system's own command.
if sys.platform.startswith('win'):
    # /F = force, /IM = match by image (executable) name.
    os.system('taskkill /F /IM qq.exe')
else:
    # killall takes a signal and a process name; the taskkill switches
    # (/F, /IM) are Windows-only and are not valid here.
    os.system('killall -9 qq')
3、实现打开网页 N 次和关闭网页 N 次,以及打开 N 次网页后才关闭网页
Python 实现代码:每打开 10 次网页后关闭一次浏览器;以下实现最少打开 (10*count) 次:
import webbrowser as web
import time
import os
import urllib
import random

# Number of open/close rounds. random.randint includes both endpoints, so
# the value lies in [1, 10].
count = random.randint(1, 10)

# Each round opens the page 10 times and then closes the browser once,
# for exactly 10 * count page opens in total.
for _round in range(count):
    for _attempt in range(10):
        # Open the page in a new browser tab.
        web.open_new_tab("需要打开的地址")
        # Give the browser 0.8 s to react before opening the next tab.
        time.sleep(0.8)
    # Kill the browser process; 360 Browser (360se.exe) is the one used here.
    os.system('taskkill /F /IM 360se.exe')
注意:本文示例基于 Python 2.7 版本,开发工具使用 PyCharm。
如果使用 Python 3.0 以上版本,部分写法需要修改后才能运行(例如 urllib.urlopen 改为 urllib.request.urlopen,print 语句改为 print() 函数)。
其他一:参考
http://justcoding.iteye.com/blog/1940717
http://www.open-open.com/lib/view/open1419163083058.html
https://www.douban.com/note/572528169/
在linux下爬取网页信息的步骤
1、安装wget命令:yum install -y wget
2、执行命令
#wget -o /tmp/wget.log -P /opt/testdata --no-parent --no-verbose -m -D mydata -N --convert-links --random-wait -A html,HTML,JSP http://www.***
#wget -r -np -d -o /itcast --accept=iso,html,HTML,ASP,asp http://www.itcast/
3、追踪爬取的日志
#tail -F /tmp/wget.log
4、成功下载后,压缩文件
#yum -y install zip
#zip -r mydata.zip mydata
其他二:爬取某站内容(注意缩进)
# -*- coding: utf-8 -*-
from urlparse import urlparse, urljoin
from os.path import splitext, dirname, isdir, exists
from os import sep, unlink , makedirs
from string import replace, find , lower
from urllib import urlretrieve
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
class Retriever(object):
    """Download one web page to a local file and extract its links.

    Attributes:
        url: The URL this retriever downloads.
        file: Local path (host + URL path) the page is saved under.
    """

    def __init__(self, url):
        """Remember *url* and work out the local path it is saved to."""
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        """Map *url* to a local file path, creating its directory if needed.

        Args:
            url: URL of the page to store.
            deffile: File name appended when the URL path names no file.

        Returns:
            The local path, built as ``<host><url path>`` with '/'
            separators.
        """
        parsedurl = urlparse(url, 'http:', 0)  # parse path
        path = parsedurl[1] + parsedurl[2]
        print(path)
        # Only the URL path can carry a file extension. Looking at
        # host + path would mistake the ".com" of a bare host such as
        # "http://example.com" for an extension and skip the default file.
        ext = splitext(parsedurl[2])
        print(ext)
        if ext[1] == '':  # no file, use default
            if path.endswith('/'):
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        print(path)
        print(ldir)
        if sep != '/':  # os-indep. path separator
            ldir = ldir.replace('/', sep)
        # ldir is empty when the path has no directory part; makedirs('')
        # would raise, so there is nothing to create in that case.
        if ldir and not isdir(ldir):  # create archive dir if nec.
            if exists(ldir):
                # A plain file is in the way of the directory; remove it.
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns:
            On success, the ``(filename, headers)`` tuple from urlretrieve.
            On IOError, a string beginning with ``'*** ERROR'``, so element
            0 of the result is ``'*'`` exactly when the download failed.
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded HTML file and return the anchors found."""
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Close the file promptly instead of leaving it to the garbage
        # collector.
        with open(self.file) as page:
            self.parser.feed(page.read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):
    """Manage the entire crawl: a queue of URLs limited to one domain.

    Attributes:
        q: URLs still waiting to be downloaded.
        seen: URLs that have already been downloaded.
        dom: Network location (host) of the starting URL.
    """

    count = 0  # static downloaded page counter

    def __init__(self, url):
        """Start the queue with *url* and record its host as the domain."""
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]
        print('self.dom:  %s' % self.dom)

    def _in_domain(self, link):
        """Return True if *link*'s host is the crawl domain or a subdomain.

        Comparing hosts (rather than searching the whole URL for the domain
        text) keeps out off-site URLs that merely mention the domain, e.g.
        in a query string.
        """
        host = urlparse(link)[1].lower()
        dom = self.dom.lower()
        return host == dom or host.endswith('.' + dom)

    def getPage(self, url):
        """Download *url*, then queue every new in-domain link it contains."""
        r = Retriever(url)
        retval = r.download()
        # download() returns an error string starting with '*' on failure;
        # in that case there is no file to parse.
        if retval[0] == '*':
            print('%s ... skipping parse' % retval)
            return
        Crawler.count += 1
        print('\n( %s )' % Crawler.count)
        print('URL: %s' % url)
        print('FILE: %s' % retval[0])
        self.seen.append(url)

        links = r.parseAndGetLinks()  # get and process links
        for eachLink in links:
            # Relative links are resolved against the current page.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)
            print('*  %s' % eachLink)

            # Skip e-mail address links.
            if 'mailto:' in eachLink.lower():
                print('... discarded, mailto link')
                continue

            if eachLink not in self.seen:
                if not self._in_domain(eachLink):
                    print('... discarded, not in domain')
                elif eachLink not in self.q:
                    self.q.append(eachLink)
                    print('... new, added to Q')
                else:
                    print('... discarded, already in Q')

    def go(self):
        """Process queued URLs (most recently added first) until none remain."""
        while self.q:
            url = self.q.pop()
            self.getPage(url)
def main():
    """Prompt for a starting URL and crawl from it.

    Returns without crawling when nothing is entered or when the prompt is
    interrupted (Ctrl-C or end of input).
    """
    try:
        start_url = raw_input('Enter starting URL: ')
    except (KeyboardInterrupt, EOFError):
        return
    if start_url:
        # A fixed start page can be used instead, e.g.
        # Crawler('http://baike.bd/subview/2202550/11243904.htm')
        Crawler(start_url).go()
        print('Done!')


if __name__ == '__main__':
    main()
编写shell脚本:
#!/bin/sh
# Mirror a web site into a local directory with wget.
# Usage: <script> <download-dir> <url>

if [ "$#" -ne 2 ]; then
    echo "usage: $0 <download-dir> <url>" >&2
    exit 1
fi

# The target directory must not be stored in PATH: that variable is the
# shell's command search path, and overwriting it breaks command lookup.
DEST_DIR="$1"
URL="$2"

echo "download url: $URL"
echo "download dir: $DEST_DIR"

# Report success only when wget itself reports success.
if /usr/bin/wget -e robots=off -w 1 -xq -np -nH -pk -m -t 1 -P "$DEST_DIR" "$URL"; then
    echo "success to download"
else
    status=$?
    echo "download finished with errors (wget exit status $status)" >&2
    exit "$status"
fi
-x 创建镜像网站对应的目录结构
-q 静默下载,即不显示下载信息,你如果想知道wget当前在下载什么资源的话,可以去掉这个选项
-m 它会打开镜像相关的选项,比如无限深度的子目录递归下载。
-t times 某个资源下载失败后的重试下载次数
-w seconds 资源请求下载之间的等待时间(减轻服务器的压力)
声明:本文仅是为了学习而举例说明python的强大,禁止用于不良目的!
1、python可以打开浏览器并浏览网页,并且保存网页内容到本地硬盘
实现代码如下:
import urllib
import webbrowser as web  # "web" is just a shorter alias

# Page to fetch; the host must be a complete domain name to resolve.
url = "http://www.jd.com"

# Download the raw page bytes (Python 2 API; Python 3 spells this
# urllib.request.urlopen).
response = urllib.urlopen(url)
try:
    content = response.read()
finally:
    response.close()

# Binary mode keeps the downloaded bytes unchanged on every platform, and
# the context manager closes (flushes) the file before the browser reads it.
with open('data.html', 'wb') as page_file:
    page_file.write(content)

# Open the file that was just written in a new browser tab.
web.open_new_tab("data.html")
2、能够调用操作系统的命令关闭浏览器
Windows 命令是:taskkill /F /IM 应用名称,如 taskkill /F /IM qq.exe 就关闭了 qq
Linux 命令是:killall 进程名(不建议使用 kill),如 killall -9 qq;注意 /F、/IM 是 Windows taskkill 的参数,killall 并不支持
python实现代码如下:
import os
import sys

# Force-terminate the QQ client by process name using the operating
# system's own command.
if sys.platform.startswith('win'):
    # /F = force, /IM = match by image (executable) name.
    os.system('taskkill /F /IM qq.exe')
else:
    # killall takes a signal and a process name; the taskkill switches
    # (/F, /IM) are Windows-only and are not valid here.
    os.system('killall -9 qq')
3、实现打开网页 N 次和关闭网页 N 次,以及打开 N 次网页后才关闭网页
Python 实现代码:每打开 10 次网页后关闭一次浏览器;以下实现最少打开 (10*count) 次:
import webbrowser as web
import time
import os
import urllib
import random

# Number of open/close rounds. random.randint includes both endpoints, so
# the value lies in [1, 10].
count = random.randint(1, 10)

# Each round opens the page 10 times and then closes the browser once,
# for exactly 10 * count page opens in total.
for _round in range(count):
    for _attempt in range(10):
        # Open the page in a new browser tab.
        web.open_new_tab("需要打开的地址")
        # Give the browser 0.8 s to react before opening the next tab.
        time.sleep(0.8)
    # Kill the browser process; 360 Browser (360se.exe) is the one used here.
    os.system('taskkill /F /IM 360se.exe')
注意:本文示例基于 Python 2.7 版本,开发工具使用 PyCharm。
如果使用 Python 3.0 以上版本,部分写法需要修改后才能运行(例如 urllib.urlopen 改为 urllib.request.urlopen,print 语句改为 print() 函数)。
其他一:参考
http://justcoding.iteye.com/blog/1940717
http://www.open-open.com/lib/view/open1419163083058.html
https://www.douban.com/note/572528169/
在linux下爬取网页信息的步骤
1、安装wget命令:yum install -y wget
2、执行命令
#wget -o /tmp/wget.log -P /opt/testdata --no-parent --no-verbose -m -D mydata -N --convert-links --random-wait -A html,HTML,JSP http://www.***
#wget -r -np -d -o /itcast --accept=iso,html,HTML,ASP,asp http://www.itcast/
3、追踪爬取的日志
#tail -F /tmp/wget.log
4、成功下载后,压缩文件
#yum -y install zip
#zip -r mydata.zip mydata
其他二:爬取某站内容(注意缩进)
# -*- coding: utf-8 -*-
from urlparse import urlparse, urljoin
from os.path import splitext, dirname, isdir, exists
from os import sep, unlink , makedirs
from string import replace, find , lower
from urllib import urlretrieve
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
class Retriever(object):
    """Download one web page to a local file and extract its links.

    Attributes:
        url: The URL this retriever downloads.
        file: Local path (host + URL path) the page is saved under.
    """

    def __init__(self, url):
        """Remember *url* and work out the local path it is saved to."""
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        """Map *url* to a local file path, creating its directory if needed.

        Args:
            url: URL of the page to store.
            deffile: File name appended when the URL path names no file.

        Returns:
            The local path, built as ``<host><url path>`` with '/'
            separators.
        """
        parsedurl = urlparse(url, 'http:', 0)  # parse path
        path = parsedurl[1] + parsedurl[2]
        print(path)
        # Only the URL path can carry a file extension. Looking at
        # host + path would mistake the ".com" of a bare host such as
        # "http://example.com" for an extension and skip the default file.
        ext = splitext(parsedurl[2])
        print(ext)
        if ext[1] == '':  # no file, use default
            if path.endswith('/'):
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        print(path)
        print(ldir)
        if sep != '/':  # os-indep. path separator
            ldir = ldir.replace('/', sep)
        # ldir is empty when the path has no directory part; makedirs('')
        # would raise, so there is nothing to create in that case.
        if ldir and not isdir(ldir):  # create archive dir if nec.
            if exists(ldir):
                # A plain file is in the way of the directory; remove it.
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns:
            On success, the ``(filename, headers)`` tuple from urlretrieve.
            On IOError, a string beginning with ``'*** ERROR'``, so element
            0 of the result is ``'*'`` exactly when the download failed.
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded HTML file and return the anchors found."""
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Close the file promptly instead of leaving it to the garbage
        # collector.
        with open(self.file) as page:
            self.parser.feed(page.read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):
    """Manage the entire crawl: a queue of URLs limited to one domain.

    Attributes:
        q: URLs still waiting to be downloaded.
        seen: URLs that have already been downloaded.
        dom: Network location (host) of the starting URL.
    """

    count = 0  # static downloaded page counter

    def __init__(self, url):
        """Start the queue with *url* and record its host as the domain."""
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]
        print('self.dom:  %s' % self.dom)

    def _in_domain(self, link):
        """Return True if *link*'s host is the crawl domain or a subdomain.

        Comparing hosts (rather than searching the whole URL for the domain
        text) keeps out off-site URLs that merely mention the domain, e.g.
        in a query string.
        """
        host = urlparse(link)[1].lower()
        dom = self.dom.lower()
        return host == dom or host.endswith('.' + dom)

    def getPage(self, url):
        """Download *url*, then queue every new in-domain link it contains."""
        r = Retriever(url)
        retval = r.download()
        # download() returns an error string starting with '*' on failure;
        # in that case there is no file to parse.
        if retval[0] == '*':
            print('%s ... skipping parse' % retval)
            return
        Crawler.count += 1
        print('\n( %s )' % Crawler.count)
        print('URL: %s' % url)
        print('FILE: %s' % retval[0])
        self.seen.append(url)

        links = r.parseAndGetLinks()  # get and process links
        for eachLink in links:
            # Relative links are resolved against the current page.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)
            print('*  %s' % eachLink)

            # Skip e-mail address links.
            if 'mailto:' in eachLink.lower():
                print('... discarded, mailto link')
                continue

            if eachLink not in self.seen:
                if not self._in_domain(eachLink):
                    print('... discarded, not in domain')
                elif eachLink not in self.q:
                    self.q.append(eachLink)
                    print('... new, added to Q')
                else:
                    print('... discarded, already in Q')

    def go(self):
        """Process queued URLs (most recently added first) until none remain."""
        while self.q:
            url = self.q.pop()
            self.getPage(url)
def main():
    """Prompt for a starting URL and crawl from it.

    Returns without crawling when nothing is entered or when the prompt is
    interrupted (Ctrl-C or end of input).
    """
    try:
        start_url = raw_input('Enter starting URL: ')
    except (KeyboardInterrupt, EOFError):
        return
    if start_url:
        # A fixed start page can be used instead, e.g.
        # Crawler('http://baike.bd/subview/2202550/11243904.htm')
        Crawler(start_url).go()
        print('Done!')


if __name__ == '__main__':
    main()
编写shell脚本:
#!/bin/sh
# Mirror a web site into a local directory with wget.
# Usage: <script> <download-dir> <url>

if [ "$#" -ne 2 ]; then
    echo "usage: $0 <download-dir> <url>" >&2
    exit 1
fi

# The target directory must not be stored in PATH: that variable is the
# shell's command search path, and overwriting it breaks command lookup.
DEST_DIR="$1"
URL="$2"

echo "download url: $URL"
echo "download dir: $DEST_DIR"

# Report success only when wget itself reports success.
if /usr/bin/wget -e robots=off -w 1 -xq -np -nH -pk -m -t 1 -P "$DEST_DIR" "$URL"; then
    echo "success to download"
else
    status=$?
    echo "download finished with errors (wget exit status $status)" >&2
    exit "$status"
fi
-x 创建镜像网站对应的目录结构
-q 静默下载,即不显示下载信息,你如果想知道wget当前在下载什么资源的话,可以去掉这个选项
-m 它会打开镜像相关的选项,比如无限深度的子目录递归下载。
-t times 某个资源下载失败后的重试下载次数
-w seconds 资源请求下载之间的等待时间(减轻服务器的压力)