1. 程式人生 > python學習 —— 建立IP代理池

python學習 —— 建立IP代理池

標籤:圖片、端口、position、except、app、rtl、分享圖片、ipp、use

  代碼:

from bs4 import BeautifulSoup
from requests import Session, get, post
from time import sleep
import random
import re, os


class ProxyIpPool(object):
    """Scrape one listing page of free proxies from kuaidaili.com.

    The original source was mis-encoded (curly quotes around every string
    literal), which made it invalid Python; all literals are restored to
    straight quotes here.
    """

    def __init__(self, page):
        """page: 1-based page number of the free-proxy listing to scrape."""
        super().__init__()
        self.page = page

    def init_proxy_ip_pool(self):
        """Fetch the listing page and return its proxy table columns.

        Returns:
            dict with keys 'ip', 'port', 'type', 'position'; each value is
            a list containing ONE inner list of column strings for this
            page (the append-of-findall shape the original callers expect).
        """
        url = 'https://www.kuaidaili.com/free/'

        tablelist = ['IP', 'PORT', '類型', '位置']

        # `proxy_type` instead of `type` — avoid shadowing the builtin.
        ip, port, proxy_type, position = [], [], [], []

        session = Session()

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            # 'Referer' intentionally omitted: per the original author's
            # note, the correct Referer would be the previous page's URL,
            # which varies per request.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36'
        }

        # Pages after the first live under /free/inha/<page>/.
        if self.page > 1:
            url = url + 'inha/' + str(self.page) + '/'

        response = session.get(url, headers=headers, timeout=2)
        print(response.status_code)
        soup = BeautifulSoup(response.text, 'lxml')
        tags = soup.find_all('td', attrs={'data-title': tablelist})
        html = str(tags)

        # Pull each table column out of the concatenated <td> markup.
        ip.append(re.findall(r'data-title="IP">(.+?)</td', html))
        port.append(re.findall(r'data-title="PORT">(.+?)</td', html))
        proxy_type.append(re.findall(r'data-title="類型">(.+?)</td', html))
        position.append(re.findall(r'data-title="位置">(.+?)</td', html))

        # Polite random delay (0-7 s) between page fetches.
        sleep(random.random() * 7)

        # Columns packaged as a dict, same keys/shape as the original.
        return {'ip': ip, 'port': port, 'type': proxy_type, 'position': position}


def create_proxy_ip_pool(page):
    """Scrape one listing page, print each proxy row and append it to disk.

    Args:
        page: 1-based page number passed through to ProxyIpPool.

    Side effects: prints every row and appends it to
    C:/Users/adimin/Desktop/proxyip.txt; exits the process on write failure.
    """
    pool = ProxyIpPool(page).init_proxy_ip_pool()

    print('初始化完成!開始創建代理池...')

    iplist = pool.get('ip')
    portlist = pool.get('port')
    typelist = pool.get('type')
    positionlist = pool.get('position')

    # Each column list holds a single inner list of values for this page.
    for i in range(len(iplist[0])):
        # Build the padded row once instead of formatting it twice.
        row = (format(iplist[0][i], '<22') + format(portlist[0][i], '<17')
               + format(typelist[0][i], '<12') + positionlist[0][i])
        print(row)
        try:
            with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
                fp.write(row + '\r\n')
        except OSError as err:
            # Original caught FileExistsError, which open(..., 'a') can
            # never raise; OSError (its superclass) catches real write
            # failures while remaining backward-compatible.
            print(err)
            os._exit(2)

if __name__ == '__main__':
    print('正在初始化代理池...請耐心等待...')

    # Header row, printed and written once before scraping starts.
    header = (format('IP', '^16') + format('PORT', '^16')
              + format('類型', '^16') + format('位置', '^16'))
    print(header)
    # Mode 'a' creates the file when it does not exist, so the original
    # bare-except fallback to mode 'w' was dead code that only hid errors.
    with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
        fp.write(header + '\r\n')

    # 不知道為什麽只能在外面循環才能爬取多頁的IP 如果把代碼改為在init_proxy_ip_pool函數中進行循環 則只能爬一頁多一點...
    # (Author's note: looping here, rather than inside init_proxy_ip_pool,
    # is what makes multi-page scraping work.)
    for page in range(1, 2177):
        create_proxy_ip_pool(page)

  運行結果:

技術分享圖片

  保存到本地:

技術分享圖片

python學習 —— 建立IP代理池