BeautifulSoup爬蟲之儲存CSV檔案

阿新 • • 發佈：2019-01-30

爬蟲儲存資料到CSV檔案

一.閒話

一般我們寫爬蟲時，都會把結果儲存為簡單的text檔案。但是當爬取的資料量很大，我們想方便地統計，或者想長時間儲存資料時，該怎麼辦？我們可以把資訊儲存為CSV格式，或者直接儲存到資料庫中，python提供了這樣的套件給我們！接下來我們以“中彩網往期雙色球資訊”為例，給大家演示如何把資訊儲存為CSV格式。

二.幹活
依然是爬蟲三部曲：分析網頁獲取目標網址，爬取資訊，儲存資訊。
1.分析網頁:
中彩網的網址為: “http://www.zhcw.com/ssq/kaijiangshuju/index.shtml?type=0/”，雙色球往期回顧頁面如下:
這裡寫圖片描述
通過分析發現，點選下一頁時位址列的網址沒有變化，資料是動態載入的；如果爬取這一頁，只能獲取一頁的資料，顯然這個地址不是最終的地址。我們接著分析網頁原始碼，找到我們需要的網址: “http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html”
依舊是分析網址我們得出他的分頁規律，為我們以後拼接頁數做準備。
2.程式碼
1）日誌檔案

#! -*- encoding:utf-8 -*-

"""
亂碼問題 解決方式一:#! -*- encoding:utf-8 -*-\
            方式二:u'哈哈哈'  字串以unicode格式儲存

"""
import logging
import getpass
import sys

class MyLog(object):
    """Small logging facade that sends every record to a file and the console.

    The wrapped logger is named after the current OS user.  The log file
    path is the running script's path (``sys.argv[0]``) with its extension
    replaced by ``.log``.

    Attributes:
        user: name returned by ``getpass.getuser()``.
        logger: the underlying ``logging.Logger``.
        logFile: path of the log file.
        formatter: formatter shared by both handlers.
        logHand: ``FileHandler`` writing to ``logFile``.
        logHandSt: ``StreamHandler`` writing to the standard error stream.
    """

    def __init__(self):
        # Imported locally so this class does not rely on a module-level
        # ``import os`` being present in the file.
        import os.path

        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        # Severity order: CRITICAL > ERROR > WARNING > INFO > DEBUG.
        self.logger.setLevel(logging.DEBUG)

        # Build the log file name from the script path.  splitext() removes
        # whatever extension is there, instead of blindly cutting the last
        # three characters (which only works for a ``.py`` suffix).
        self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'
        self.formatter = logging.Formatter(
            '%(asctime) -12s %(levelname) -8s %(name) -10s %(message)-12s\r\n'
        )

        # File output.
        self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
        self.logHand.setFormatter(self.formatter)
        self.logHand.setLevel(logging.DEBUG)

        # Console output.  The level is set on the stream handler itself
        # (the original set it on the file handler a second time).
        self.logHandSt = logging.StreamHandler()
        self.logHandSt.setFormatter(self.formatter)
        self.logHandSt.setLevel(logging.DEBUG)

        # Attach both handlers to the logger.
        self.logger.addHandler(self.logHand)
        self.logger.addHandler(self.logHandSt)

    # Thin pass-through methods, one per severity level.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn() is a deprecated alias; call warning() directly.
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)

if __name__ == '__main__':
    # Smoke test: emit one record through each severity wrapper.
    demo_log = MyLog()
    for emit, text in (
        (demo_log.debug, u'debug測試'),
        (demo_log.info, u'info測試'),
        (demo_log.warn, u'warn測試'),
        (demo_log.error, u'error測試'),
        (demo_log.critical, u'critical測試'),
    ):
        emit(text)

2）爬蟲程式碼:

"""
   目標地址:  http://kaijiang.zhcw.com/zhcw/html/ssq/list_60.html
"""

from MyLog import MyLog
import string
from urllib.parse import quote
from urllib import error
import urllib.request
from bs4 import BeautifulSoup
import codecs
import re
from save2excel import SaveBallDate



#爬取出來的資料封裝
class DoubleColorBallItem(object):
    """Plain record holding one scraped draw; every field defaults to None."""

    # Draw date and issue number.
    date = order = None
    # Six red balls followed by the blue ball.
    red1 = red2 = red3 = red4 = red5 = red6 = None
    blue = None
    # Sales amount, first-prize value, province text, second-prize value.
    money = firstPrize = province = secondPrize = None

class GetDoubleColorBallNumber(object):
    """Crawl the double-colour-ball draw history and persist it.

    Creating an instance runs the whole job: build the list of page URLs,
    scrape every page, write the rows to a text file and finally pass them
    to ``SaveBallDate``.

    Attributes:
        urls: page URLs queued for scraping.
        log: ``MyLog`` instance used for progress and error messages.
        items: ``DoubleColorBallItem`` objects produced by ``spider``.
    """

    # Template of a history page; the placeholder is the page number.
    LIST_URL = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_%s.html'

    # Subtracted from the page count reported by the site when the URL list
    # is built.  NOTE(review): presumably a cap that keeps the demo crawl
    # short -- confirm before relying on the resulting page range.
    PAGE_OFFSET = 107

    def __init__(self):
        self.urls = []
        self.log = MyLog()
        # Build the list of page URLs from the reported page count.
        self.getUrls()
        # Scrape every queued page.
        self.items = self.spider(self.urls)
        # Persist as plain text.
        self.pipelines(self.items)
        # Persist through SaveBallDate.
        SaveBallDate(self.items)

    def getResponseContent(self, url):
        """Download *url* and return the raw response body.

        Returns:
            The body as ``bytes``, or ``None`` when the request fails with
            ``urllib.error.URLError`` (the failure is logged).
        """
        # Percent-encode every character outside string.printable.
        url = quote(url, safe=string.printable)
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(url) as response:
                content = response.read()
        except error.URLError as e:
            self.log.error(u'python爬取 %s 出錯了' % url)
            print(e)
            return None
        self.log.info(u'python爬取 %s 成功' % url)
        return content

    def getUrls(self):
        """Read the total page count from page 1 and queue the page URLs.

        Leaves ``self.urls`` untouched when page 1 cannot be downloaded or
        the page count cannot be located.
        """
        url = self.LIST_URL % 1
        htmlContent = self.getResponseContent(url)
        if htmlContent is None:
            # The download failure has already been logged.
            return

        soup = BeautifulSoup(htmlContent, 'lxml')
        # Match <p> tags only.  A regex such as re.compile('p') also matches
        # every tag whose name merely contains "p" (span, script, ...).
        paragraphs = soup.find_all('p')
        if not paragraphs or paragraphs[-1].strong is None:
            self.log.error(u'在 %s 找不到總頁數' % url)
            return

        pages = paragraphs[-1].strong.getText().strip()
        for i in range(1, int(pages) - self.PAGE_OFFSET):
            url = self.LIST_URL % i
            self.urls.append(url)
            self.log.info(u'新增url %s 到 urls列表中待爬取' % url)

    def spider(self, urls):
        """Scrape every URL in *urls* and return the parsed items.

        Pages that fail to download are skipped instead of aborting the run.
        """
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Already logged by getResponseContent; go on with the rest.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            for tag in soup.find_all('tr'):
                # Only rows that contain <em> tags are parsed as draw results.
                if not tag.find('em'):
                    continue
                item = self._parseRow(tag)
                items.append(item)
                self.log.info(u'爬取時間為 %s 的資料成功' % (item.date))
        return items

    @staticmethod
    def _parseRow(tag):
        """Convert one result ``<tr>`` into a ``DoubleColorBallItem``."""
        item = DoubleColorBallItem()
        tagTd = tag.find_all('td')
        item.date = tagTd[0].get_text().strip()
        item.order = tagTd[1].get_text().strip()
        # The third cell holds seven <em> tags: six red balls, then the blue.
        balls = [em.get_text().strip() for em in tagTd[2].find_all('em')]
        (item.red1, item.red2, item.red3,
         item.red4, item.red5, item.red6, item.blue) = balls[:7]
        item.money = tagTd[3].find('strong').get_text().strip()
        item.firstPrize = tagTd[4].find('strong').get_text().strip()
        # Full text of the same cell the first-prize value is read from.
        item.province = tagTd[4].get_text().strip()
        item.secondPrize = tagTd[5].find('strong').get_text().strip()
        return item

    def pipelines(self, items):
        """Write one line per item to a UTF-8 text file in the working dir."""
        fileName = u'雙色球中獎資訊.txt'
        with codecs.open(fileName, 'w', 'utf8') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s %s \t %s %s %s %s \n' % (
                    item.date, item.order,
                    item.red1, item.red2, item.red3,
                    item.red4, item.red5, item.red6, item.blue,
                    item.money, item.firstPrize, item.province,
                    item.secondPrize))
                self.log.info(u'期數為 %s 的雙色球資訊儲存成功' % (item.order))

if __name__ == '__main__':
    # Constructing the crawler runs the whole job (see its __init__).
    gbn = GetDoubleColorBallNumber()

3）今天的重點 --> 儲存為CSV檔案
xlrd、xlwt 是兩個第三方模組，用來讀寫 Excel（.xls）活頁簿，而不是純文字的CSV檔案；若要輸出真正的CSV，可以使用python標準庫的csv模組。具體的用法請檢視它們的api（未安裝記得安裝: pip install xlwt），我這裡只講實戰，嘻嘻！

#! -*- encoding:utf-8 -*-
import csv
import os
import sys

import xlrd
import xlwt

class SaveBallDate(object):
    """Write scraped draw records to ``hello.csv`` in the working directory.

    Creating an instance immediately writes the file.  Each item must
    expose the attributes named in ``COLUMNS``.
    """

    # (header text, item attribute) pairs, in output column order.
    COLUMNS = (
        (u'開獎日期', 'date'),
        (u'期號', 'order'),
        (u'紅球1', 'red1'),
        (u'紅球2', 'red2'),
        (u'紅球3', 'red3'),
        (u'紅球4', 'red4'),
        (u'紅球5', 'red5'),
        (u'紅球6', 'red6'),
        (u'藍球', 'blue'),
        (u'銷售金額', 'money'),
        (u'一等獎', 'firstPrize'),
        (u'中獎省份', 'province'),
        (u'二等獎', 'secondPrize'),
    )

    def __init__(self, items):
        self.items = items
        self.run(self.items)

    def run(self, items):
        """Write a header row followed by one row per element of *items*.

        The output is a real comma-separated text file, so its content
        matches the ``.csv`` extension.  Every item is written, including
        the last one.
        """
        fileName = 'hello.csv'
        # newline='' is what the csv module requires for files it writes.
        # 'utf-8-sig' prepends a BOM so that spreadsheet applications can
        # recognise the file as UTF-8.
        with open(fileName, 'w', encoding='utf-8-sig', newline='') as fp:
            writer = csv.writer(fp)
            writer.writerow([title for title, _ in self.COLUMNS])
            for item in items:
                writer.writerow(
                    [getattr(item, attr) for _, attr in self.COLUMNS]
                )

結果：
這裡寫圖片描述

記得匯入 from save2excel import SaveBallDate，並在 __init__ 函式中新增儲存CSV的呼叫。
這裡寫圖片描述

下一次我們講解如何將爬取的資料儲存到MySQL資料庫當中！！！

BeautifulSoup爬蟲之儲存CSV檔案

爬蟲儲存資料到CSV檔案

BeautifulSoup爬蟲之儲存CSV檔案

plsql 儲存csv檔案到oracle 資料庫中。

PHP強化之10 - CSV檔案處理

plsql 儲存csv檔案到oracle 資料庫中。

Python3使用csv模組csv.writer().writerow()儲存csv檔案，產生空行的問題

Spring batch教程之讀取CSV檔案並寫入MySQL資料庫

TestNg引數化測試之讀取csv檔案

Python人臉識別之——建立csv檔案 create_csv.py 程式碼 Python3.7

Jmeter介面自動化例項(使用Beanshell儲存csv檔案、csv引數化、setUp執行緒組)

爬蟲資料儲存為csv檔案時，表格中間隔有空行問題

【Python3 爬蟲學習筆記】資料儲存 3 -- CSV檔案儲存

【Python3 爬蟲學習筆記】資料儲存 3 -- CSV檔案儲存 1

scrapy爬蟲儲存為csv檔案的技術分析

Python爬蟲學習6：scrapy入門（一）爬取汽車評論並儲存到csv檔案

爬蟲之BeautifulSoup， CSS

Python爬蟲之利用BeautifulSoup爬取豆瓣小說（三）——將小說信息寫入文件

Python開發爬蟲之BeautifulSoup解析網頁篇：爬取安居客網站上北京二手房數據

爬蟲之beautifulsoup模塊

python 爬蟲之BeautifulSoup 庫的基本使用

運用python將json檔案儲存成csv檔案

BeautifulSoup爬蟲之儲存CSV檔案

爬蟲儲存資料到CSV檔案

相關推薦