Python学习:构建代理池

前言

在使用爬虫时,高并发的请求很容易造成IP封禁,这个时候就需要用到代理ip访问来进行绕过了。构建代理池这里我用了两种方式

  1. 第一种是github上的一个开源项目 proxy_pool
  2. 手动编写代码采集(只适合练手,推荐第一种)

一丶proxy_pool

安装步骤

1.下载

​git clone https://github.com/jhao104/proxy_pool.git

2.下载依赖库

pip install -r requirements.txt

3.配置环境

安装redis,这里我直接用小皮面板安装了

 

设置redis密码

配置setting.py文件

代理采集以及使用

python proxyPool.py schedule  # 采集可用代理ip

 开启web服务

python proxyPool.py server

完成后可访问 ip:端口/get 获取单条代理ip,访问 ip:端口/all 获取所有可用代理ip

二丶手动采集

1.所用到的库:requests,threadpool,bs4,random,json,time(time用于限速,以防请求过快导致ip封禁)

2.定义一个随机请求头部函数,伪装还是必须得要的啦

def req_headers():
    """Build a randomized request header for crawling.

    Returns:
        dict: a headers mapping with a random User-Agent and a random
        spoofed source IP in 'X-Forwarded-For'.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59",
    ]
    user_agent = random.choice(ua_list)  # pick a random UA string

    # Random fake source address; each octet is in 1..254 like the original
    # random.choice(range(1, 255)) calls, just written more directly.
    fake_ip = '%d.%d.%d.%d' % tuple(random.randint(1, 254) for _ in range(4))

    headers = {
        'User-Agent': user_agent,
        # Bug fix: the conventional header is 'X-Forwarded-For';
        # the original 'X-Forward-for' is not recognized by servers.
        'X-Forwarded-For': fake_ip,
    }
    return headers  # a freshly randomized header dict

3.代理ip查询

 

 

可以看到所有的数据都保存在了一个个的td标签中,在替换页数的时候url会随之变化,而我们需要的只有ip,port,类型将它裁剪出来然后放到文本中

def req(page):
    """Scrape one page of kuaidaili's free-proxy list into proxy.txt.

    Each usable table row is appended as one JSON object per line,
    e.g. {"http": "http://1.2.3.4:80"}.

    Args:
        page: page number of https://www.kuaidaili.com/free/inha/.
    """
    response = requests.get(f'https://www.kuaidaili.com/free/inha/{page}/', headers=req_headers(), verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')
    # Open the file once per page instead of once per row; `with` closes it,
    # so the original's redundant f.close() is gone.
    with open('proxy.txt', 'a') as f:
        for row in rows:
            cells = row.text[1:].split()
            # Skip the header row and malformed rows: a real data row has at
            # least 4 cells and a numeric port, otherwise indexing would raise.
            if len(cells) < 4 or not cells[1].isdigit():
                continue
            scheme = cells[3].lower()
            # Bug fix: the original format string mixed a double quote with a
            # single quote ('{"%s\': ...'), producing lines that json.loads()
            # in proxy() cannot parse.
            f.write('{"%s": "%s://%s:%s"}\n' % (scheme, scheme, cells[0], cells[1]))
    time.sleep(1)  # rate-limit between pages to avoid an IP ban

 这里我没有加多线程跑,因为第一次跑被封ip了

4.代理ip测试

#  导入代理ip
#  导入代理ip
def proxy():
    """Load collected proxies from proxy.txt.

    Returns:
        list[dict]: one mapping per line, e.g. {"http": "http://1.2.3.4:80"}.
    """
    # `with` closes the file (the original leaked the handle);
    # json.loads tolerates the trailing newline, so no manual slicing is
    # needed, and blank lines are skipped instead of crashing the parse.
    with open('proxy.txt', 'r') as f:
        return [json.loads(line) for line in f if line.strip()]

#  发送请求
def check_url(proxies):
    """Probe one proxy mapping against baidu and record it if usable.

    Args:
        proxies: requests-style mapping, e.g. {"http": "http://1.2.3.4:80"}.

    Side effects:
        Appends the proxy (as a JSON line) to ok.txt on HTTP 200.
    """
    headers = req_headers()
    time.sleep(1)  # throttle probes
    try:
        response = requests.get('http://www.baidu.com', proxies=proxies, headers=headers, timeout=5, allow_redirects=False)
    except Exception:
        # Dead or slow proxies are expected; silently drop them.
        return
    # Bug fix: `proxies` is a dict, so the original `'=' * 15 + proxies`
    # raised TypeError inside the try-block and the blanket `except` swallowed
    # it -- no working proxy was ever printed or written to ok.txt.
    proxy_line = json.dumps(proxies)
    print('=' * 15 + proxy_line + '=' * 15)
    if response.status_code == 200:
        print("============代理地址IP可用============")
        with open('ok.txt', 'a+') as f:
            f.write(proxy_line + '\n')
    else:
        print('error')

完整代码

import time
import requests
import json
import threadpool
from bs4 import BeautifulSoup
import random


# 生成随机头部
# 生成随机头部
def req_headers():
    """Build a randomized request header for crawling.

    Returns:
        dict: a headers mapping with a random User-Agent and a random
        spoofed source IP in 'X-Forwarded-For'.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59",
    ]
    user_agent = random.choice(ua_list)  # pick a random UA string

    # Random fake source address; each octet is in 1..254 like the original
    # random.choice(range(1, 255)) calls, just written more directly.
    fake_ip = '%d.%d.%d.%d' % tuple(random.randint(1, 254) for _ in range(4))

    headers = {
        'User-Agent': user_agent,
        # Bug fix: the conventional header is 'X-Forwarded-For';
        # the original 'X-Forward-for' is not recognized by servers.
        'X-Forwarded-For': fake_ip,
    }
    return headers


def proxy():
    """Load collected proxies from proxy.txt.

    Returns:
        list[dict]: one mapping per line, e.g. {"http": "http://1.2.3.4:80"}.
    """
    # `with` closes the file (the original leaked the handle);
    # json.loads tolerates the trailing newline, so no manual slicing is
    # needed, and blank lines are skipped instead of crashing the parse.
    with open('proxy.txt', 'r') as f:
        return [json.loads(line) for line in f if line.strip()]


def check_url(proxies):
    """Probe one proxy mapping against the test site and record it if usable.

    Args:
        proxies: requests-style mapping, e.g. {"http": "http://1.2.3.4:80"}.

    Side effects:
        Appends the proxy (as a JSON line) to ok.txt on HTTP 200.
    """
    headers = req_headers()
    time.sleep(1)  # throttle probes
    try:
        response = requests.get('http://zhihuiy.top', proxies=proxies, headers=headers, timeout=5,
                                allow_redirects=False)
    except Exception:
        # Dead or slow proxies are expected; silently drop them.
        return
    # Bug fix: `proxies` is a dict, so the original `'=' * 15 + proxies`
    # raised TypeError inside the try-block and the blanket `except` swallowed
    # it -- no working proxy was ever printed or written to ok.txt.
    proxy_line = json.dumps(proxies)
    print('=' * 15 + proxy_line + '=' * 15)
    if response.status_code == 200:
        print("============代理地址IP可用============")
        with open('ok.txt', 'a+') as f:
            f.write(proxy_line + '\n')
    else:
        print('error')

def req(page):
    """Scrape one page of kuaidaili's free-proxy list into proxy.txt.

    Each usable table row is appended as one JSON object per line,
    e.g. {"http": "http://1.2.3.4:80"}.

    Args:
        page: page number of https://www.kuaidaili.com/free/inha/.
    """
    print(f"===============正在爬取第{page}页===============")
    response = requests.get(f'https://www.kuaidaili.com/free/inha/{page}/', headers=req_headers(), verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')
    # Open the file once per page instead of once per row; `with` closes it,
    # so the original's redundant f.close() is gone.
    with open('proxy.txt', 'a') as f:
        for row in rows:
            cells = row.text[1:].split()
            # Skip the header row and malformed rows: a real data row has at
            # least 4 cells and a numeric port, otherwise indexing would raise.
            if len(cells) < 4 or not cells[1].isdigit():
                continue
            scheme = cells[3].lower()
            f.write('{"%s": "%s://%s:%s"}\n' % (scheme, scheme, cells[0], cells[1]))
    time.sleep(1)  # rate-limit between pages to avoid an IP ban


def main():
    """Validate every stored proxy concurrently with a 50-worker thread pool."""
    pool = threadpool.ThreadPool(50)
    # Build one work request per proxy, then hand the whole batch to the pool.
    work_items = []
    for candidate in proxy():
        work_items.extend(threadpool.makeRequests(check_url, (candidate,)))
    for item in work_items:
        pool.putRequest(item)
    pool.wait()  # block until all probes finish

if __name__ == '__main__':
    # Crawl pages 1..3999 sequentially (deliberately single-threaded: the
    # author notes their IP was banned when scraping too fast), then
    # validate everything that was collected via the thread pool in main().
    for num in range(1, 4000):
        req(num)
    main()