1. 程式人生 > au3抓取不得姐網站

au3抓取不得姐網站

lena www reg dex exp star gen www. desc

au3抓取不得姐網站

網站地址:http://www.budejie.com/text/

用的正則有些別扭,見笑。

代碼:

#include <IE.au3>
#include <File.au3>
#include <String.au3>
#include <Array.au3>
#include <Debug.au3>
#include <Date.au3>
;code try to collect budejie stories of www.budejie.com

; Collects the text stories from pages $startindex..$endindex of
; http://www.budejie.com/text/ and appends them, numbered, to a text file
; named budejie_<MON><MDAY>.txt in the current working directory.

Local $strUrl1 = "http://www.budejie.com/text/"

; Output file name: budejie_<month><day>.txt
Local $filename1 = "budejie" & "_" & @MON & @MDAY & ".txt"

; Each page is downloaded to this temporary file, read back, then deleted.
Local $filesave = @TempDir & "\budejie.html"

Local $pageindex
Local $startindex = 1
Local $endindex = 5
Local $sHTML
Local $storycount = 0

; One match per story. The capture group returns only the story text, so no
; second pass is needed to strip the leading markup.
Local $sStoryPattern = '<div class="j-r-list-c-desc">[ \n\r]+<a href="/detail-\d+\.html">([^<]+)</a>'

_FileCreate($filename1)
Local $file = FileOpen($filename1, 1) ; 1 = append mode
If $file = -1 Then
    MsgBox(0, "Error", "Unable to open file.")
    Exit
EndIf

For $pageindex = $startindex To $endindex Step 1
    $strUrl1 = MakeUpUrl($pageindex)

    ; Background download (options 1 = force reload, background 1); poll until done.
    Local $hDownload = InetGet($strUrl1, $filesave, 1, 1)
    Do
        Sleep(250)
    Until InetGetInfo($hDownload, 2) ; index 2 = download complete
    Local $nBytes = InetGetInfo($hDownload, 0) ; index 0 = bytes read so far
    Local $bSuccess = InetGetInfo($hDownload, 3) ; index 3 = download successful
    InetClose($hDownload)
    ConsoleWrite($pageindex & "/" & $endindex & " --- down bytes = " & $nBytes & @LF)

    ; Skip pages that failed to download instead of reading a missing file.
    If Not $bSuccess Then
        FileDelete($filesave)
        ContinueLoop
    EndIf

    Local $ftemp = FileOpen($filesave, 0) ; 0 = read mode
    If $ftemp = -1 Then
        FileDelete($filesave)
        ContinueLoop
    EndIf
    $sHTML = FileRead($ftemp) ; no count given: read the whole file
    FileClose($ftemp)
    FileDelete($filesave)

    ; Flag 3 = return an array of all global matches (here: the captured story texts).
    Local $aArray = StringRegExp($sHTML, $sStoryPattern, 3)
    ConsoleWrite("aArray size = " & UBound($aArray) & @CRLF)
    ; When nothing matches the result is not an array and UBound() yields 0.
    If UBound($aArray) <= 0 Then
        ContinueLoop
    EndIf

    For $i = 0 To UBound($aArray) - 1
        Local $storycontent = $aArray[$i]
        If StringLen($storycontent) > 0 Then
            $storycount = $storycount + 1
            ; Each story is written as "<n>." on its own line, then the text.
            FileWrite($file, $storycount & "." & @CRLF)
            FileWrite($file, $storycontent & @CRLF)
        EndIf
    Next
Next

FileClose($file)
MsgBox(0, "BUDEJIE", "Complete, story count = " & $storycount & ", story= " & $filename1)
Exit

; Builds the URL of one listing page.
; Parameters: $pagenum - 1-based page number.
; Returns:    the base URL for page 1, otherwise the base URL with the page
;             number appended.
Func MakeUpUrl($pagenum)
    Local $strUrl = "http://www.budejie.com/text/"
    ; "=" compares numerically; "==" would be a case-sensitive string comparison.
    If $pagenum = 1 Then
        Return $strUrl
    EndIf
    Return $strUrl & $pagenum
EndFunc

au3抓取不得姐網站