Environment for this example: macOS, Python 3.
This example only brushes PV (page views): each target page is simply visited, nothing more.
Proxies are used to avoid being blocked by the target site.
Besides the methods/libraries below (which only cover the first step of crawling, fetching a page), there are many other options: https://www.zhihu.com/question/60280580 . Scrapy is a common one; a minimal sketch of it follows.
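For a sense of how Scrapy approaches the same first step, here is a minimal sketch. This is my own illustration, not from this article: the spider name is arbitrary, and the URL is one of the targets listed at the end.

import scrapy

class TitleSpider(scrapy.Spider):
    name = "title_spider"  # hypothetical spider name
    start_urls = ["https://www.jianshu.com/p/e53ffeebf7c0"]  # one of the target URLs listed later

    def parse(self, response):
        # Scrapy's built-in selectors take the place of BeautifulSoup here
        yield {"title": response.css("title::text").get()}

Save it as, say, title_spider.py and run it with: scrapy runspider title_spider.py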
Method 1. Using urllib.request
1.1 Sample code
import time
import random
import HeaderManager
import BrushUrlManager
import urllib.request as req
from bs4 import BeautifulSoup


def getProxyList():
    """Prepare some proxy servers, sourced from https://www.xicidaili.com/nt"""
    header = random.choice(HeaderManager.headerList)
    header = header[12:]  # strip the "User-Agent: " prefix
    options = req.Request("https://www.xicidaili.com/nt", headers={"User-Agent": header})  # 1. request URL and headers (a dict {"": ""})
    resp = req.urlopen(options)  # 2. open the request directly
    html = resp.read()  # 3. get the response page
    soup = BeautifulSoup(html, "lxml")
    ips = soup.findAll("tr")
    proxyList = []
    for x in range(1, len(ips)):
        ip = ips[x]
        tds = ip.findAll("td")
        proxy_protocol = str(tds[5].contents[0]).lower()
        proxy_host = tds[1].contents[0]
        proxy_port = tds[2].contents[0]
        host = proxy_protocol + "://" + proxy_host + ":" + proxy_port
        proxy_host = {proxy_protocol: host}
        proxyList.append(proxy_host)
        # print("Proxy address: %s" % host)
    return proxyList


proxyList = getProxyList()


def processing():
    targetURl = random.choice(BrushUrlManager.urlTargetList)
    print("Brushing URL: %s" % targetURl)
    header = random.choice(HeaderManager.headerList)
    header = header[12:]
    proxy = random.choice(proxyList)
    print("Proxy: %s" % proxy)
    try:
        reqOptions = req.Request(targetURl, headers={"User-Agent": header})  # 1. build a request: target URL plus headers
        proxyHandler = req.ProxyHandler(proxy)  # 2. build a proxy handler; the proxy is a dict like {"https": "121.237.148.182:3000"}
        opener = req.build_opener(proxyHandler)  # 3. build an opener from the proxy handler
        resp = opener.open(reqOptions)  # 4. open the request through the opener, getting the response
        page = resp.read()  # 5. read the response body
        html = page.decode("utf-8")
        soup = BeautifulSoup(html, "lxml")
        title = soup.find("title").string
        print(time.strftime("%H:%M:%S") + ":" + title)
    except Exception as e:
        proxyList.remove(proxy)
        print(e)
    print()


while True:  # TODO: add a stop condition as needed
    processing()
    time.sleep(random.uniform(1, 5))
1.2 Testing proxy IPs
There are many IP-checking sites, e.g. https://tool.lu/ip/ ; opening it shows your machine's public IP. That real IP is exactly what the proxy is supposed to hide.
Many public proxies are of unreliable quality and fail to actually hide your real IP, so it is worth testing them: a test script follows, with a pool-filtering sketch after it.
import time
import urllib.request as req
from bs4 import BeautifulSoup

header = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
proxyIp = {"https": "121.237.148.182:3000"}  # a proxy for the https scheme
targetURl = "https://tool.lu/ip/"  # a page that reports which IP the request came from
try:
    reqOptions = req.Request(targetURl, headers={'User-Agent': header})
    proxyHandler = req.ProxyHandler(proxyIp)
    opener = req.build_opener(proxyHandler)
    resp = opener.open(reqOptions)
    time.sleep(5)
    html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    inputs = soup.findAll("input")
    ip = inputs[1]["value"]
    print("IP seen through the proxy: %s" % ip)  # if this prints 121.237.148.182, the proxy works
except Exception as e:
    print(e)
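Going one step further, the same check can filter the whole pool from getProxyList() in section 1.1 before brushing starts. The sketch below is my addition, not part of the original scripts: checkProxy() is a hypothetical helper, and the User-Agent string is just one entry from the header pool.

import urllib.request as req
from bs4 import BeautifulSoup

def checkProxy(proxy):
    """proxy is a dict as built by getProxyList(), e.g. {"https": "https://121.237.148.182:3000"}"""
    try:
        reqOptions = req.Request("https://tool.lu/ip/",
                                 headers={"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"})
        opener = req.build_opener(req.ProxyHandler(proxy))
        html = opener.open(reqOptions, timeout=10).read()
        soup = BeautifulSoup(html, "lxml")
        shownIp = soup.findAll("input")[1]["value"]
        # usable only if the page reports the proxy's address rather than ours
        return any(shownIp in addr for addr in proxy.values())
    except Exception:
        return False

# usage with the pool from 1.1:
# proxyList = [p for p in getProxyList() if checkProxy(p)]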
Method 2. Using requests
requests is a third-party module and is somewhat simpler to use than the built-in urllib.request.
import time
import random
import HeaderManager
import BrushUrlManager
import requests
from bs4 import BeautifulSoup

"""
requests is a third-party library; install it with:
pip3 install requests
"""


def getProxyList():
    """Gather usable proxies"""
    header = random.choice(HeaderManager.headerList)
    header = header[12:]  # strip the "User-Agent: " prefix
    response = requests.get("https://www.xicidaili.com/nt", headers={"User-Agent": header})  # GET(target URL, headers)
    html = response.text  # the page comes straight off the response object
    soup = BeautifulSoup(html, "lxml")
    ips = soup.findAll("tr")
    proxyList = []
    for x in range(1, len(ips)):
        ip = ips[x]
        tds = ip.findAll("td")
        proxy_protocol = str(tds[5].contents[0]).lower()
        proxy_host = tds[1].contents[0]
        proxy_port = tds[2].contents[0]
        host = proxy_protocol + "://" + proxy_host + ":" + proxy_port
        proxy_host = {proxy_protocol: host}
        proxyList.append(proxy_host)
        # print("Proxy address: %s" % host)
    return proxyList


proxyList = getProxyList()


def processing():
    targetURl = random.choice(BrushUrlManager.urlTargetList)
    print("Brushing URL: %s" % targetURl)
    header = random.choice(HeaderManager.headerList)
    header = header[12:]
    proxy = random.choice(proxyList)
    print("Proxy: %s" % proxy)
    try:
        response = requests.get(targetURl, headers={"User-Agent": header}, proxies=proxy)  # GET(target URL, headers, proxies)
        response.encoding = "utf-8"  # match the page's charset to avoid garbled Chinese text
        html = response.text
        soup = BeautifulSoup(html, "lxml")
        title = soup.find("title").string
        print(time.strftime("%H:%M:%S") + ":" + title)
    except Exception as e:
        proxyList.remove(proxy)
        print(e)
    print()


while True:
    processing()
    time.sleep(random.uniform(1, 5))
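To illustrate the brevity claim, here is the proxy check from section 1.2 redone with requests. A sketch only: the proxy address is the example from section 1.2 and is probably long dead.

import requests
from bs4 import BeautifulSoup

proxy = {"https": "121.237.148.182:3000"}  # example address from section 1.2; likely stale
resp = requests.get("https://tool.lu/ip/",
                    headers={"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
                    proxies=proxy, timeout=10)  # proxies= replaces the ProxyHandler/opener plumbing
soup = BeautifulSoup(resp.text, "lxml")
print("IP seen through the proxy: %s" % soup.findAll("input")[1]["value"])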
Method 3. Using selenium
selenium is also a third-party library. Using it additionally requires installing a driver for the browser you pick.
It handles dynamically loaded pages. For example, on https://voice.baidu.com/act/newpneumonia/newpneumonia all the case counts are rendered dynamically; the two methods above get back an essentially blank page. selenium driving a real browser can reach the rendered content.
import random
import time
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
import HeaderManager
import BrushUrlManager

"""
1. Install with:
pip3 install selenium
2. This example drives Chrome, so download the ChromeDriver matching your browser version from:
http://npm.taobao.org/mirrors/chromedriver
On macOS, put the downloaded chromedriver binary into a directory already on your PATH
(or configure a new one), e.g. under $JAVA_HOME/bin.
If it is not on the PATH, you must pass the driver file's path in code.
"""


def getProxyList():
    """Gather usable proxies"""
    option = ChromeOptions()  # 1. configure options
    option.add_argument("--headless")  # headless mode
    option.add_argument("--no-sandbox")
    option.add_argument("--disable-gpu")
    option.add_argument("disable-infobars")
    browser = Chrome(options=option)  # 2. create the browser
    browser.get("https://www.xicidaili.com/nt")  # 3. open the URL
    html = browser.page_source  # grab the page
    browser.close()
    browser.quit()
    soup = BeautifulSoup(html, "lxml")
    ips = soup.findAll("tr")
    proxyList = []
    for x in range(1, len(ips)):
        ip = ips[x]
        tds = ip.findAll("td")
        proxy_protocol = str(tds[5].contents[0]).lower()
        proxy_host = tds[1].contents[0]
        proxy_port = tds[2].contents[0]
        host = proxy_protocol + "://" + proxy_host + ":" + proxy_port
        proxyList.append(host)  # note: in this example each entry is a plain string
    return proxyList


def processing():
    """Brush page views"""
    proxyList = getProxyList()  # the proxy IP pool
    while True:
        proxy = random.choice(proxyList)
        print("Proxy: " + proxy)
        header = random.choice(HeaderManager.headerList)
        # print("Header: " + header)
        urlTarget = random.choice(BrushUrlManager.urlTargetList)
        print("Target: " + urlTarget)
        try:
            option = ChromeOptions()
            option.add_argument("--headless")
            option.add_argument("--no-sandbox")
            option.add_argument("--disable-gpu")
            option.add_argument("disable-infobars")
            option.add_argument("--proxy-server=" + proxy)  # set the proxy, e.g. "http://121.237.148.182:3000"
            option.add_argument(header)
            option.add_experimental_option('excludeSwitches', ['enable-automation'])
            # browser = Chrome(executable_path="path/to/chromedriver", options=option)
            browser = Chrome(options=option)
            browser.get(urlTarget)
            time.sleep(random.uniform(5, 10))
            title = browser.title
            print("Visited page: " + title)
            time.sleep(random.uniform(1, 5))
            # browser.implicitly_wait(3)  # wait 3 seconds
            # js = "window.scrollTo(0, document.body.scrollHeight/3)"
            # browser.execute_script(js)
            # time.sleep(random.uniform(1, 3))
            # js = "window.scrollTo(0, document.body.scrollHeight)"
            # browser.execute_script(js)
            # time.sleep(random.uniform(1, 3))
            # js = "window.scrollTo(0, -document.body.scrollHeight/3*2)"
            # browser.execute_script(js)
            # time.sleep(random.uniform(3, 10))
            browser.close()
            # browser.quit()
            # time.sleep(random.uniform(5, 10))
        except Exception as e:
            print("Exception:")
            print(e)
            print(e.__traceback__.tb_lineno)
            proxyList.remove(proxy)
        time.sleep(random.uniform(0, 5))


processing()
selenium is a testing library at heart, mainly used for automated browser testing.
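One practical note for dynamically rendered pages like the Baidu one above: selenium's explicit waits are more robust than the fixed time.sleep() calls in the script. A sketch follows; the 15-second timeout is my choice and "#nationTable" is an illustrative CSS selector you would replace after inspecting the page.

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

option = ChromeOptions()
option.add_argument("--headless")
browser = Chrome(options=option)
try:
    browser.get("https://voice.baidu.com/act/newpneumonia/newpneumonia")
    # block up to 15 seconds until the dynamically rendered node exists,
    # instead of sleeping an arbitrary fixed interval
    WebDriverWait(browser, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#nationTable")))
    print(browser.title)
finally:
    browser.quit()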
HeaderManager and BrushUrlManager
# HeaderManager.py: a pool of different headers
header01 = "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"
header02 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
header03 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"
header04 = "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
header05 = "User-Agent: Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"
header06 = "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"
header07 = "User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"
header08 = "User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
header09 = "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0 "
header10 = "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
header11 = "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"
header12 = "User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)"
header13 = "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)"
header14 = "User-Agent: Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"
header15 = "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
header16 = "User-Agent: Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19"
header17 = "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2"
header18 = "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
header19 = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
header20 = "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
header21 = "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
header22 = "User-Agent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
header23 = "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
header24 = "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
header25 = "User-Agent: Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"
headerList = [header01, header02, header03, header04, header05, header06, header07, header08, header09, header10,
              header11, header12, header13, header14, header15, header16, header17, header18, header19, header20,
              header21, header22, header23, header24, header25]


# BrushUrlManager.py: the target pages
urlt1 = "https://www.jianshu.com/p/e53ffeebf7c0"
urlt2 = "https://www.jianshu.com/p/32eb96926e9b"
urlt3 = "https://www.jianshu.com/p/5ed7394bde78"
urlt4 = "https://www.jianshu.com/p/54b2271aad80"
urlt5 = "https://www.jianshu.com/p/58904911ce9e"
urlt6 = "https://www.jianshu.com/p/566dead0ff6d"
urlt7 = "https://www.jianshu.com/p/124234fd1924"
urlt8 = "https://www.jianshu.com/p/0b0b6c339d44"
urlt9 = "https://www.jianshu.com/p/54cca148be62"
urlTargetList = [urlt1, urlt2, urlt3, urlt4, urlt5, urlt6, urlt7, urlt8, urlt9]
- end
Declaration
This article is an original work by 崔维友 (威格灵 / cuiweiyou / vigiles). Please credit the source when reposting: http://www.gaohaiyan.com/2499.html