1. 程式人生 > >python3[爬蟲實戰] 爬蟲之requests爬取新浪微博京東客服

python3[爬蟲實戰] 爬蟲之requests爬取新浪微博京東客服

爬取的內容為京東客服的微博及評論

思路:主要是通過手機端訪問新浪微博的api介面,然後對返回的資料進行篩選。

這個主要是登入之後的微博url連結:

可以看到的介面:

這裡寫圖片描述

這裡主要爬取的內容為:

說說,說說下面的評論條目

雖然很簡單,但是不得不說句mmp,爬取的過程很坎坷,目前一直卡在ip被封的問題上。另外,個人經過嘗試,每次請求間隔睡眠30秒效果也不見得比較好,睡眠10秒就足夠了;不過該封你的ip還是會封的,我遇到的問題應該就是ip被封的情況。

爬取的方法主要是通過手機端api進行json資料的獲取,然後進行資料的提取。

這裡可以使用火狐(Firefox)瀏覽器的外掛:

主要api:

說說API:

類似於這樣子的,

詳情評論內容API:

在每條說說(微博)的資料中會有一個idstr,例如:4137390568546147

帶大家看一下評論api返回的資料(JSON格式):

{
    "cardlistInfo": {
        "containerid": "1076035650743478",
        "v_p": 42,
        "show_style": 1,
        "total": 3264,
        "page": 2
    },
    "cards": [
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4137858652321796",
            "scheme": "https://m.weibo.cn/status/FfSSl9K0k?mblogid=FfSSl9K0k&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "2小時前",
                "id": "4137858652321796",
                "mid": "4137858652321796",
                "idstr": "4137858652321796",
                "text": "明天又要上班了,用四個字描述下你現在的心情吧<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/d_erha-0d2bea3a7d.png\" style=\"width:1em;height:1em;\" alt=\"[二哈]\"></span> ​​​",
                "textLength": 50,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx4.sinaimg.cn/thumbnail/006apWvQgy1fi7tkjguy4j309q09qt8q.jpg",
                "bmiddle_pic": "http://wx4.sinaimg.cn/bmiddle/006apWvQgy1fi7tkjguy4j309q09qt8q.jpg",
                "original_pic": "http://wx4.sinaimg.cn/large/006apWvQgy1fi7tkjguy4j309q09qt8q.jpg",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 0,
                "comments_count": 4,
                "attitudes_count": 2,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfSSl9K0k",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi7tkjguy4j309q09qt8q",
                        "url": "https://wx4.sinaimg.cn/orj360/006apWvQgy1fi7tkjguy4j309q09qt8q.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": "350",
                            "height": "350",
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx4.sinaimg.cn/large/006apWvQgy1fi7tkjguy4j309q09qt8q.jpg",
                            "geo": {
                                "width": "350",
                                "height": "350",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4137692553365577",
            "scheme": "https://m.weibo.cn/status/FfOyre7xv?mblogid=FfOyre7xv&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "13小時前",
                "id": "4137692553365577",
                "mid": "4137692553365577",
                "idstr": "4137692553365577",
                "text": "你覺得舉辦哪種《中國有_____》比賽,你能進入決賽? ​​​",
                "textLength": 49,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx2.sinaimg.cn/thumbnail/006apWvQgy1fi7ul9n9rfj30k00lsgnj.jpg",
                "bmiddle_pic": "http://wx2.sinaimg.cn/bmiddle/006apWvQgy1fi7ul9n9rfj30k00lsgnj.jpg",
                "original_pic": "http://wx2.sinaimg.cn/large/006apWvQgy1fi7ul9n9rfj30k00lsgnj.jpg",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 0,
                "comments_count": 13,
                "attitudes_count": 1,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfOyre7xv",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi7ul9n9rfj30k00lsgnj",
                        "url": "https://wx2.sinaimg.cn/orj360/006apWvQgy1fi7ul9n9rfj30k00lsgnj.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 360,
                            "height": 392,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx2.sinaimg.cn/large/006apWvQgy1fi7ul9n9rfj30k00lsgnj.jpg",
                            "geo": {
                                "width": "720",
                                "height": "784",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4137390568546147",
            "scheme": "https://m.weibo.cn/status/FfGHmzRf5?mblogid=FfGHmzRf5&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "昨天 14:24",
                "id": "4137390568546147",
                "mid": "4137390568546147",
                "idstr": "4137390568546147",
                "text": "週末就是買買買,吃吃吃<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_huaixiao-bb5966dcc6.png\" style=\"width:1em;height:1em;\" alt=\"[壞笑]\"></span> ​​​",
                "textLength": 28,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx2.sinaimg.cn/thumbnail/006apWvQgy1fi7taijr9pg307e05kgvl.gif",
                "bmiddle_pic": "http://wx2.sinaimg.cn/bmiddle/006apWvQgy1fi7taijr9pg307e05kgvl.gif",
                "original_pic": "http://wx2.sinaimg.cn/large/006apWvQgy1fi7taijr9pg307e05kgvl.gif",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 0,
                "comments_count": 19,
                "attitudes_count": 1,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfGHmzRf5",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi7taijr9pg307e05kgvl",
                        "url": "https://wx2.sinaimg.cn/orj360/006apWvQgy1fi7taijr9pg307e05kgvl.gif",
                        "size": "orj360",
                        "geo": {
                            "width": "266",
                            "height": "200",
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx2.sinaimg.cn/large/006apWvQgy1fi7taijr9pg307e05kgvl.gif",
                            "geo": {
                                "width": "266",
                                "height": "200",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4137278329132849",
            "scheme": "https://m.weibo.cn/status/FfDMkCjS1?mblogid=FfDMkCjS1&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "昨天 06:58",
                "id": "4137278329132849",
                "mid": "4137278329132849",
                "idstr": "4137278329132849",
                "text": "週六早呀,今天有比我起的還早的嗎<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_wabishi-f5765407f7.png\" style=\"width:1em;height:1em;\" alt=\"[挖鼻]\"></span> ​​​​",
                "textLength": 47,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx4.sinaimg.cn/thumbnail/006apWvQgy1fi7tiv5e5qj30dc0d5dfz.jpg",
                "bmiddle_pic": "http://wx4.sinaimg.cn/bmiddle/006apWvQgy1fi7tiv5e5qj30dc0d5dfz.jpg",
                "original_pic": "http://wx4.sinaimg.cn/large/006apWvQgy1fi7tiv5e5qj30dc0d5dfz.jpg",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 0,
                "comments_count": 8,
                "attitudes_count": 2,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfDMkCjS1",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi7tiv5e5qj30dc0d5dfz",
                        "url": "https://wx4.sinaimg.cn/orj360/006apWvQgy1fi7tiv5e5qj30dc0d5dfz.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 273,
                            "height": 270,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx4.sinaimg.cn/large/006apWvQgy1fi7tiv5e5qj30dc0d5dfz.jpg",
                            "geo": {
                                "width": "480",
                                "height": "473",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4137054743266182",
            "scheme": "https://m.weibo.cn/status/FfxXIdHGm?mblogid=FfxXIdHGm&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "08-04",
                "id": "4137054743266182",
                "mid": "4137054743266182",
                "idstr": "4137054743266182",
                "text": "就問一句,這樣人美心善的90後小哥你們要不要?<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_tian-52ea252705.png\" style=\"width:1em;height:1em;\" alt=\"[舔屏]\"></span><span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_tian-52ea252705.png\" style=\"width:1em;height:1em;\" alt=\"[舔屏]\"></span>",
                "source": "微博 weibo.com",
                "favorited": false,
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "retweeted_status": {
                    "created_at": "08-04",
                    "id": "4137016583280831",
                    "mid": "4137016583280831",
                    "idstr": "4137016583280831",
                    "text": "<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_tian-52ea252705.png\" style=\"width:1em;height:1em;\" alt=\"[舔屏]\"></span><span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_tian-52ea252705.png\" style=\"width:1em;height:1em;\" alt=\"[舔屏]\"></span><span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_tian-52ea252705.png\" style=\"width:1em;height:1em;\" alt=\"[舔屏]\"></span> <a data-url=\"http://t.cn/R9S6VWV\" href=\"http://media.weibo.cn/article?object_id=1022%3A2309404137016584472707&url_type=39&object_type=article&pos=1&luicode=10000011&lfid=1076035650743478&featurecode=20000320&id=2309404137016584472707&ep=FfwYadLuD%2C1717871843%2CFfwYadLuD%2C1717871843\" data-hide=\"\"><span class=\"url-icon\"><img src=\"https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_article_default.png\"></span></i><span class=\"surl-text\">90後小哥徵婚啟事</a> ​​​",
                    "textLength": 38,
                    "source": "微博 weibo.com",
                    "favorited": false,
                    "user": {
                        "id": 1717871843,
                        "screen_name": "京東",
                        "profile_image_url": "https://tvax4.sinaimg.cn/crop.0.0.480.480.180/6664a4e3ly8fffaxrnv8fj20dc0dcmy4.jpg",
                        "profile_url": "https://m.weibo.cn/u/1717871843?uid=1717871843&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                        "statuses_count": 19903,
                        "verified": true,
                        "verified_type": 2,
                        "verified_type_ext": 50,
                        "verified_reason": "京東網上商城",
                        "description": "中國最大的自營電商企業京東商城集團線上銷售家電、數碼通訊、電腦、家居百貨、服裝服飾、母嬰、圖書、食品等13大類數萬個品牌上千萬種優質商品。",
                        "gender": "m",
                        "mbtype": 12,
                        "urank": 43,
                        "mbrank": 5,
                        "follow_me": false,
                        "following": false,
                        "followers_count": 4025036,
                        "follow_count": 258,
                        "cover_image_phone": "https://wx1.sinaimg.cn/crop.0.0.640.640.640/6664a4e3ly1fffb8torrtj20ku0ku409.jpg"
                    },
                    "reposts_count": 12,
                    "comments_count": 24,
                    "attitudes_count": 16,
                    "isLongText": false,
                    "visible": {
                        "type": 0,
                        "list_id": 0
                    },
                    "page_info": {
                        "page_pic": {
                            "url": "https://wx3.sinaimg.cn/crop.0.0.617.347.1000/6664a4e3ly1fi7khoua7dj20hk09nn45.jpg"
                        },
                        "page_url": "http://media.weibo.cn/article?object_id=1022%3A2309404137016584472707&url_type=39&object_type=article&pos=2&luicode=10000011&lfid=1076035650743478&featurecode=20000320&id=2309404137016584472707",
                        "page_title": "京東",
                        "content1": "90後小哥徵婚啟事",
                        "content2": "",
                        "icon": "https://h5.sinaimg.cn/upload/2016/12/28/14/feed_headlines_icon_flash20161228_2.png",
                        "type": "article"
                    },
                    "bid": "FfwYadLuD"
                },
                "reposts_count": 0,
                "comments_count": 30,
                "attitudes_count": 1,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "raw_text": "就問一句,這樣人美心善的90後小哥你們要不要?[舔屏][舔屏]",
                "bid": "FfxXIdHGm"
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4136952959746775",
            "scheme": "https://m.weibo.cn/status/FfvjxETA3?mblogid=FfvjxETA3&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "08-04",
                "id": "4136952959746775",
                "mid": "4136952959746775",
                "idstr": "4136952959746775",
                "text": "週五早上上班的你和下班的你<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_xiaoku-7430606cb7.png\" style=\"width:1em;height:1em;\" alt=\"[笑cry]\"></span> ​​​",
                "textLength": 33,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx1.sinaimg.cn/thumbnail/006apWvQgy1fi7fkqpatfj30j60j6jsg.jpg",
                "bmiddle_pic": "http://wx1.sinaimg.cn/bmiddle/006apWvQgy1fi7fkqpatfj30j60j6jsg.jpg",
                "original_pic": "http://wx1.sinaimg.cn/large/006apWvQgy1fi7fkqpatfj30j60j6jsg.jpg",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 0,
                "comments_count": 14,
                "attitudes_count": 1,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfvjxETA3",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi7fkqpatfj30j60j6jsg",
                        "url": "https://wx1.sinaimg.cn/orj360/006apWvQgy1fi7fkqpatfj30j60j6jsg.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 360,
                            "height": 360,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx1.sinaimg.cn/large/006apWvQgy1fi7fkqpatfj30j60j6jsg.jpg",
                            "geo": {
                                "width": "690",
                                "height": "690",
                                "croped": false
                            }
                        }
                    },
                    {
                        "pid": "006apWvQgy1fi7fkuj1tvg308c0fkmxy",
                        "url": "https://wx1.sinaimg.cn/orj360/006apWvQgy1fi7fkuj1tvg308c0fkmxy.gif",
                        "size": "orj360",
                        "geo": {
                            "width": "300",
                            "height": "560",
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx1.sinaimg.cn/large/006apWvQgy1fi7fkuj1tvg308c0fkmxy.gif",
                            "geo": {
                                "width": "300",
                                "height": "560",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4136663145262324",
            "scheme": "https://m.weibo.cn/status/FfnM6m4Yc?mblogid=FfnM6m4Yc&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "08-03",
                "id": "4136663145262324",
                "mid": "4136663145262324",
                "idstr": "4136663145262324",
                "text": "輸入法,你們喜歡用哪種?<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/d_doge-d903433c82.png\" style=\"width:1em;height:1em;\" alt=\"[doge]\"></span> ​​​",
                "textLength": 30,
                "source": "微博 weibo.com",
                "favorited": false,
                "thumbnail_pic": "http://wx4.sinaimg.cn/thumbnail/006apWvQgy1fi6i8tkspqj30ku0i7mz4.jpg",
                "bmiddle_pic": "http://wx4.sinaimg.cn/bmiddle/006apWvQgy1fi6i8tkspqj30ku0i7mz4.jpg",
                "original_pic": "http://wx4.sinaimg.cn/large/006apWvQgy1fi6i8tkspqj30ku0i7mz4.jpg",
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "reposts_count": 4,
                "comments_count": 40,
                "attitudes_count": 6,
                "isLongText": false,
                "visible": {
                    "type": 0,
                    "list_id": 0
                },
                "mblogtype": 0,
                "bid": "FfnM6m4Yc",
                "pics": [
                    {
                        "pid": "006apWvQgy1fi6i8tkspqj30ku0i7mz4",
                        "url": "https://wx4.sinaimg.cn/orj360/006apWvQgy1fi6i8tkspqj30ku0i7mz4.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 309,
                            "height": 270,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx4.sinaimg.cn/large/006apWvQgy1fi6i8tkspqj30ku0i7mz4.jpg",
                            "geo": {
                                "width": "750",
                                "height": "655",
                                "croped": false
                            }
                        }
                    },
                    {
                        "pid": "006apWvQgy1fi6i8z010xj30ku0h6jte",
                        "url": "https://wx3.sinaimg.cn/orj360/006apWvQgy1fi6i8z010xj30ku0h6jte.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 327,
                            "height": 270,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx3.sinaimg.cn/large/006apWvQgy1fi6i8z010xj30ku0h6jte.jpg",
                            "geo": {
                                "width": "750",
                                "height": "618",
                                "croped": false
                            }
                        }
                    },
                    {
                        "pid": "006apWvQgy1fi6i988w7pj30kt0hbgms",
                        "url": "https://wx2.sinaimg.cn/orj360/006apWvQgy1fi6i988w7pj30kt0hbgms.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 324,
                            "height": 270,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx2.sinaimg.cn/large/006apWvQgy1fi6i988w7pj30kt0hbgms.jpg",
                            "geo": {
                                "width": "749",
                                "height": "623",
                                "croped": false
                            }
                        }
                    },
                    {
                        "pid": "006apWvQgy1fi6i9bnkgfj30ku0gwgmj",
                        "url": "https://wx2.sinaimg.cn/orj360/006apWvQgy1fi6i9bnkgfj30ku0gwgmj.jpg",
                        "size": "orj360",
                        "geo": {
                            "width": 333,
                            "height": 270,
                            "croped": false
                        },
                        "large": {
                            "size": "large",
                            "url": "https://wx2.sinaimg.cn/large/006apWvQgy1fi6i9bnkgfj30ku0gwgmj.jpg",
                            "geo": {
                                "width": "750",
                                "height": "608",
                                "croped": false
                            }
                        }
                    }
                ]
            },
            "show_type": 0,
            "openurl": ""
        },
        {
            "card_type": 9,
            "itemid": "1076035650743478_-_4136613988263792",
            "scheme": "https://m.weibo.cn/status/FfmuOyFMY?mblogid=FfmuOyFMY&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
            "mblog": {
                "created_at": "08-03",
                "id": "4136613988263792",
                "mid": "4136613988263792",
                "idstr": "4136613988263792",
                "text": "<a class='k' href='https://m.weibo.cn/k/%E5%BC%A0%E8%8B%A5%E6%98%80%E5%94%90%E8%89%BA%E6%98%95%E5%85%AC%E5%BC%80%E6%81%8B%E6%83%85?from=feed'>#張若昀唐藝昕公開戀情#</a> 恭喜呀<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/l_xin-8e9a1a0346.png\" style=\"width:1em;height:1em;\" alt=\"[心]\"></span><span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/l_xin-8e9a1a0346.png\" style=\"width:1em;height:1em;\" alt=\"[心]\"></span><span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/l_xin-8e9a1a0346.png\" style=\"width:1em;height:1em;\" alt=\"[心]\"></span>,大家就默默乾了這碗狗糧吧,狗糧夠吃嗎?不夠吃的話,你(jing)們(dong)懂(you)的(shou)<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/default/d_wabishi-f5765407f7.png\" style=\"width:1em;height:1em;\" alt=\"[挖鼻]\"></span>",
                "source": "微博 weibo.com",
                "favorited": false,
                "user": {
                    "id": 5650743478,
                    "screen_name": "京東客服",
                    "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg",
                    "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                    "statuses_count": 3245,
                    "verified": true,
                    "verified_type": 2,
                    "verified_type_ext": 0,
                    "verified_reason": "北京京東世紀貿易有限公司",
                    "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服",
                    "gender": "f",
                    "mbtype": 2,
                    "urank": 29,
                    "mbrank": 2,
                    "follow_me": false,
                    "following": false,
                    "followers_count": 18427,
                    "follow_count": 235,
                    "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg"
                },
                "retweeted_status": {
                    "created_at": "08-02",
                    "id": "4136423907632073",
                    "mid": "4136423907632073",
                    "idstr": "4136423907632073",
                    "text": "時光賜給我們盜不走的愛人,而你賜給我時光。<a href='https://m.weibo.cn/n/唐藝昕'>@唐藝昕</a> ​​​",
                    "textLength": 49,
                    "source": "iPhone 6s",
                    "favorited": false,
                    "thumbnail_pic": "http://wx1.sinaimg.cn/thumbnail/6cf03c75ly1fi5qtg3z8fj20hs0nqq46.jpg",
                    "bmiddle_pic": "http://wx1.sinaimg.cn/bmiddle/6cf03c75ly1fi5qtg3z8fj20hs0nqq46.jpg",
                    "original_pic": "http://wx1.sinaimg.cn/large/6cf03c75ly1fi5qtg3z8fj20hs0nqq46.jpg",
                    "user": {
                        "id": 1827683445,
                        "screen_name": "張若昀",
                        "profile_image_url": "https://tva3.sinaimg.cn/crop.9.0.494.494.180/6cf03c75jw8fajncv51lvj20e80dq74i.jpg",
                        "profile_url": "https://m.weibo.cn/u/1827683445?uid=1827683445&luicode=10000011&lfid=1076035650743478&featurecode=20000320",
                        "statuses_count": 1199,
                        "verified": true,
                        "verified_type": 0,
                        "verified_type_ext": 1,
                        "verified_reason": "演員張若昀",
                        "description": "Per Aspera Ad Astra 循此苦旅,以達天際。 工作郵箱:
[email protected]
", "gender": "m", "mbtype": 12, "urank": 37, "mbrank": 6, "follow_me": false, "following": false, "followers_count": 13527839, "follow_count": 195, "cover_image_phone": "https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg" }, "picStatus": "0:1,1:1", "reposts_count": 283896, "comments_count": 325438, "attitudes_count": 2380726, "isLongText": false, "visible": { "type": 0, "list_id": 0 }, "cardid": "star_183", "bid": "Ffhyew1rX", "pics": [ { "pid": "6cf03c75ly1fi5qtg3z8fj20hs0nqq46", "url": "https://wx1.sinaimg.cn/orj360/6cf03c75ly1fi5qtg3z8fj20hs0nqq46.jpg", "size": "orj360", "geo": { "width": 360, "height": 480, "croped": false }, "large": { "size": "large", "url": "https://wx1.sinaimg.cn/large/6cf03c75ly1fi5qtg3z8fj20hs0nqq46.jpg", "geo": { "width": "640", "height": "854", "croped": false } } }, { "pid": "6cf03c75ly1fi5qtfv90rj20c80c6dgs", "url": "https://wx1.sinaimg.cn/orj360/6cf03c75ly1fi5qtfv90rj20c80c6dgs.jpg", "size": "orj360", "geo": { "width": 271, "height": 270, "croped": false }, "large": { "size": "large", "url": "https://wx1.sinaimg.cn/large/6cf03c75ly1fi5qtfv90rj20c80c6dgs.jpg", "geo": { "width": "440", "height": "438", "croped": false } } } ] }, "reposts_count": 3, "comments_count": 13, "attitudes_count": 6, "isLongText": false, "visible": { "type": 0, "list_id": 0 }, "mblogtype": 0, "raw_text": "#張若昀唐藝昕公開戀情# 恭喜呀[心][心][心],大家就默默乾了這碗狗糧吧,狗糧夠吃嗎?不夠吃的話,你(jing)們(dong)懂(you)的(shou)[挖鼻]", "bid": "FfmuOyFMY" }, "show_type": 0, "openurl": "" }, { "card_type": 9, "itemid": "1076035650743478_-_4136598981629551", "scheme": "https://m.weibo.cn/status/Ffm6C6PV5?mblogid=Ffm6C6PV5&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "mblog": { "created_at": "08-03", "id": "4136598981629551", "mid": "4136598981629551", "idstr": "4136598981629551", "text": "彷彿看到了自己<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/d_erha-0d2bea3a7d.png\" style=\"width:1em;height:1em;\" alt=\"[二哈]\"></span>", "source": "微博 
weibo.com", "favorited": false, "user": { "id": 5650743478, "screen_name": "京東客服", "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg", "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "statuses_count": 3245, "verified": true, "verified_type": 2, "verified_type_ext": 0, "verified_reason": "北京京東世紀貿易有限公司", "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服", "gender": "f", "mbtype": 2, "urank": 29, "mbrank": 2, "follow_me": false, "following": false, "followers_count": 18427, "follow_count": 235, "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg" }, "retweeted_status": { "created_at": "08-02", "id": "4136434165892638", "mid": "4136434165892638", "idstr": "4136434165892638", "text": "我在張若昀和唐藝昕公開戀情的微博裡看到了你唉~~<span class=\"url-icon\"><img src=\"//h5.sinaimg.cn/m/emoticon/icon/others/d_doge-d903433c82.png\" style=\"width:1em;height:1em;\" alt=\"[doge]\"></span> ​​​", "textLength": 54, "source": "", "favorited": false, "thumbnail_pic": "http://wx3.sinaimg.cn/thumbnail/bb97de37ly1fi5s0g76jrj20yi0p1n0m.jpg", "bmiddle_pic": "http://wx3.sinaimg.cn/bmiddle/bb97de37ly1fi5s0g76jrj20yi0p1n0m.jpg", "original_pic": "http://wx3.sinaimg.cn/large/bb97de37ly1fi5s0g76jrj20yi0p1n0m.jpg", "user": { "id": 3147292215, "screen_name": "草圖君", "profile_image_url": "https://tva4.sinaimg.cn/crop.0.0.511.511.180/bb97de37jw8f57ewfuqt9j20e70e8q37.jpg", "profile_url": "https://m.weibo.cn/u/3147292215?uid=3147292215&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "statuses_count": 5980, "verified": true, "verified_type": 0, "verified_type_ext": 1, "verified_reason": "直播紅人 微博知名綜藝博主", "description": "一個得罪了半個娛樂圈的少年", "gender": "m", "mbtype": 12, "urank": 44, "mbrank": 6, "follow_me": false, "following": false, "followers_count": 6192418, "follow_count": 433, "cover_image_phone": 
"https://tva2.sinaimg.cn/crop.0.0.640.640.640/bb97de37jw1ewysfmiioyj20yi0ykqe7.jpg" }, "picStatus": "0:1,1:1,2:1,3:1", "reposts_count": 3832, "comments_count": 7349, "attitudes_count": 65785, "isLongText": false, "visible": { "type": 0, "list_id": 0 }, "bid": "FfhOMoIWy", "pics": [ { "pid": "bb97de37ly1fi5s0g76jrj20yi0p1n0m", "url": "https://wx3.sinaimg.cn/orj360/bb97de37ly1fi5s0g76jrj20yi0p1n0m.jpg", "size": "orj360", "geo": { "width": 372, "height": 270, "croped": false }, "large": { "size": "large", "url": "https://wx3.sinaimg.cn/large/bb97de37ly1fi5s0g76jrj20yi0p1n0m.jpg", "geo": { "width": "1242", "height": "901", "croped": false } } }, { "pid": "bb97de37ly1fi5s0goz0nj20hs0nq0tw", "url": "https://wx4.sinaimg.cn/orj360/bb97de37ly1fi5s0goz0nj20hs0nq0tw.jpg", "size": "orj360", "geo": { "width": 360, "height": 480, "croped": false }, "large": { "size": "large", "url": "https://wx4.sinaimg.cn/large/bb97de37ly1fi5s0goz0nj20hs0nq0tw.jpg", "geo": { "width": "640", "height": "854", "croped": false } } }, { "pid": "bb97de37ly1fi5s0h69g3j20c80c7juk", "url": "https://wx1.sinaimg.cn/orj360/bb97de37ly1fi5s0h69g3j20c80c7juk.jpg", "size": "orj360", "geo": { "width": 270, "height": 270, "croped": false }, "large": { "size": "large", "url": "https://wx1.sinaimg.cn/large/bb97de37ly1fi5s0h69g3j20c80c7juk.jpg", "geo": { "width": "440", "height": "439", "croped": false } } }, { "pid": "bb97de37ly1fi5s0fg68mj202g02g3yo", "url": "https://wx1.sinaimg.cn/orj360/bb97de37ly1fi5s0fg68mj202g02g3yo.jpg", "size": "orj360", "geo": { "width": "88", "height": "88", "croped": false }, "large": { "size": "large", "url": "https://wx1.sinaimg.cn/large/bb97de37ly1fi5s0fg68mj202g02g3yo.jpg", "geo": { "width": "88", "height": "88", "croped": false } } } ] }, "reposts_count": 2, "comments_count": 21, "attitudes_count": 7, "isLongText": false, "visible": { "type": 0, "list_id": 0 }, "mblogtype": 0, "raw_text": "彷彿看到了自己[二哈]", "bid": "Ffm6C6PV5" }, "show_type": 0, "openurl": "" }, { "card_type": 11, 
"show_type": 0, "card_group": [], "openurl": "" }, { "card_type": 9, "itemid": "1076035650743478_-_4136407577953610", "scheme": "https://m.weibo.cn/status/Ffh7Txn62?mblogid=Ffh7Txn62&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "mblog": { "created_at": "08-02", "id": "4136407577953610", "mid": "4136407577953610", "idstr": "4136407577953610", "text": "<a class='k' href='https://m.weibo.cn/k/%E4%B8%80%E4%B8%AA%E6%84%9F%E4%BA%BA%E7%9A%84%E6%95%85%E4%BA%8B?from=feed'>#一個感人的故事#</a>去年暑假,8歲的小明特意坐了三個多小時車去奶奶家;奶奶為了小明也願意去縣城的超市買小明愛的薯片和巧克力等零食,但是奶奶家沒有WiFi和智慧手機,奶奶可以陪他一起看古裝電視劇;講他最愛聽的神話故事,唱小曲哄他睡覺……奶奶家有吃不完的零食,也不會&quot;太無聊了&quot;<br/>今年,奶奶提前做 ​​​...<a href=\"/status/4136407577953610\">全文</a>", "textLength": 393, "source": "微博 weibo.com", "favorited": false, "user": { "id": 5650743478, "screen_name": "京東客服", "profile_image_url": "https://tva4.sinaimg.cn/crop.38.7.206.206.180/006apWvQjw8f9dwuejt68j307y0630sz.jpg", "profile_url": "https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "statuses_count": 3245, "verified": true, "verified_type": 2, "verified_type_ext": 0, "verified_reason": "北京京東世紀貿易有限公司", "description": "訂單諮詢、問題反饋、意見建議……獲取專業貼心服務,盡在京東客服", "gender": "f", "mbtype": 2, "urank": 29, "mbrank": 2, "follow_me": false, "following": false, "followers_count": 18427, "follow_count": 235, "cover_image_phone": "https://tva4.sinaimg.cn/crop.0.0.640.640.640/006apWvQjw1f2g20q03tbj30e80e8t93.jpg" }, "reposts_count": 6, "comments_count": 17, "attitudes_count": 2, "isLongText": true, "visible": { "type": 0, "list_id": 0 }, "mblogtype": 0, "page_info": { "page_pic": { "url": "https://ww3.sinaimg.cn/thumb180/74f67c55jw9ey0hrixq57j2050050t92.jpg" }, "page_url": "https://m.weibo.cn/p/index?containerid=100808f50fb5741ffd610570b92baf2cc3b342&extparam=%E4%B8%80%E4%B8%AA%E6%84%9F%E4%BA%BA%E7%9A%84%E6%95%85%E4%BA%8B&luicode=10000011&lfid=1076035650743478&featurecode=20000320", "page_title": "#一個感人的故事#", "content1": "", "content2": 
"3人關注", "type": "topic" }, "bid": "Ffh7Txn62" }, "show_type": 0, "openurl": "" } ], "ok": 1, "showAppTips": 0, "scheme": "sinaweibo://cardlist?containerid=1076035650743478&luicode=10000011&lfid=100103type=1&q=京東客服&featurecode=20000320" }

### 上面只是一個頁面的說說,估計寫前端、移動端的要暈死,好惡心,萬一某個欄位再返回個 null 或者空值回來……

上面程式碼可以直接在jsonview裡面進行格式化,

這裡寫圖片描述

爬取的欄位是:cards 下面的mblog下面的:text ,idstr(拼接評論頁的)

這裡的id就是idstr

詳情頁就是上面評論條目的json串,搞下來也是一大把,結構跟上面的差不多,詳情頁裡面的資料跟評論頁的資料也差不多,這裡就不再繼續多寫了,因為上面的內容已經佔用了不少篇幅。

因為微博會封IP,第一次爬取了4w多條資料就GG了;第二天晚上改成每爬取一條就睡眠30秒,發現毛用也沒有,只好接著爬。等IP解封之後,換了cookie、換了starturl、換了page索引繼續爬取,每次睡眠10秒,反正睡多了也沒用。最後爬取的垃圾資料有22萬條左右,去重並去掉不要的之後,估計也就4000條左右,反正也沒細數。

附上幾張爬蟲過程中的圖片截圖:

這裡寫圖片描述

這裡寫圖片描述

這裡寫圖片描述

最後是微博資料的結果圖片:

這裡寫圖片描述

這裡的程式碼上傳到github上了,有需要的話可以自己去下載,另外寫了一份類似於 爬取新浪微博京東客服 @京東客服的簡單爬蟲。

發一下牢騷,json串又多又大又不穩定,返回不一致

貼上部分程式碼:

# encoding=utf8
import requests
import json
import re
import time

# Entry URL of the container/getIndex API for uid 5650743478 (no page parameter).
startUrl = 'https://m.weibo.cn/api/container/getIndex?uid=5650743478&luicode=10000011&lfid=100103type%3D1%26q%3D%E4%BA%AC%E4%B8%9C%E5%AE%A2%E6%9C%8D&featurecode=20000320&type=uid&value=5650743478&containerid=1076035650743478'

# HTTP headers sent with every request.
# NOTE(review): the Cookie is a hard-coded login session; swap in the cookie of
# your own account before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Cookie': 'ALF=1504709445; SCF=Ag0epa_4tyFCglnCwHJiaRDznUy645wpqEhg-dG3Sv0cbfGX1wNmqXPnHQroard1FW2nn3RdCnmux4VZ7bFRuMo.; SUHB=0ebt4qVvtKU1d7; _T_WM=22bb4d80315608a0e9bd3bf92b3c1dac; SUB=_2A250jA4VDeRhGeBN6FsT8i7MyTyIHXVXjpJdrDV6PUJbktBeLXjBkW1oTOqmqg0rff3UmekP4TzhMFYtsw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFNrBkhSeVrfPGckwnaFCcy5JpX5o2p5NHD95Qce0e4eoz7ehz7Ws4DqcjBIcHVdr.peoepeoefeK5Ee5tt; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%2540%25E4%25BA%25AC%25E4%25B8%259C%25E5%25AE%25A2%25E6%259C%258D%26featurecode%3D20000320%26fid%3D1076035650743478%26uicode%3D10000011',
    'Host': 'm.weibo.cn',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://m.weibo.cn/u/5650743478?uid=5650743478&luicode=10000011&lfid=100103type%3D1%26q%3D%40%E4%BA%AC%E4%B8%9C%E5%AE%A2%E6%9C%8D&featurecode=20000320',
}

# Detail-page list
detaiList = []
# Post texts
textList = []
# Posts together with their detail pages
textAnddetailList = []
# Comment counts; the detail API returns 10 comments per page
commentsList = []

# Comment count of every post that has been parsed
numSizeList = []

# Comment-page links
detaiLinks = []
def getJsonData(url, timeout=30):
    """Download ``url`` with the module-level ``headers`` and return the body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Requests startUrl once at module level, before any function below is called.
# NOTE(review): no live code visible in this file reads `jsonData` -- confirm
# whether this extra request is still needed.
jsonData = getJsonData(startUrl)


def parseDetailListdata(listdata):
    """Write the text and reply text of every comment entry to the output file.

    Each entry of ``listdata`` is looked up for the keys ``text`` and
    ``reply_text``; a missing key is treated as an empty string. Both values
    are written to the module-level file ``f`` (each followed by ``\\r\\n``)
    and echoed with ``print``.

    Args:
        listdata: Iterable of comment entries from one comment page.
    """
    for comment in listdata:
        if 'text' in comment:
            body = comment['text']
        else:
            body = ""
        if 'reply_text' in comment:
            reply = comment['reply_text']
        else:
            reply = ""

        f.write(body + '\r\n')
        print(body)
        print(reply)
        f.write(reply + '\r\n')


def parseJsonData(jsonData):
    """Parse one timeline page and dump every post together with its comments.

    For each card that carries an ``mblog`` entry, the post text is cleaned
    with ``getTextInfo``, written to the output file ``f`` and appended to
    ``textList``; its comment count is appended to ``numSizeList``; then every
    comment-page URL returned by ``getpageSize`` is downloaded with
    ``getJsonData`` and passed to ``parseDetailListdata``.

    Cards without an ``mblog`` entry are skipped. The loop used to ``return``
    at the first such card, which dropped every post that followed it on the
    same page.

    Args:
        jsonData: Raw JSON text of one timeline page.

    Raises:
        ValueError: If ``jsonData`` or a downloaded comment page is not valid
            JSON (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    global pagedetail
    # json.loads() accepts its options as keyword-only arguments since
    # Python 3.6, so the former positional 'utf-8' raised TypeError. The
    # input is already a str, so no encoding is needed.
    page = json.loads(jsonData)
    print(page)
    cards = page.get('cards', "")
    print(cards)

    for datainfo in cards:
        mblog = datainfo.get('mblog')
        if not mblog:
            # Not a post card; keep going with the remaining cards.
            continue

        descText = getTextInfo(mblog['text'])
        f.write('發表的說說開始:\r\n')
        f.write('發表的說說內容:' + descText + '\r\n')
        print("發表的說說開始:")
        print('發表的說說內容:' + descText)
        textList.append(descText)

        comments = mblog['comments_count']  # number of comments on this post
        numSizeList.append(comments)

        idstr = mblog['idstr']
        # pagedetail stays a module-level counter, as in the original code.
        pagedetail = 1
        for detaillink in getpageSize(comments, idstr):
            rawComments = getJsonData(detaillink)
            f.write('評論詳情頁條目:' + str(pagedetail) + '      .......\r\n')
            print('評論詳情頁條目:' + str(pagedetail) + '      .......')
            print(rawComments)
            pagedetail = pagedetail + 1
            commentPage = json.loads(rawComments)
            parseDetailListdata(commentPage.get('data', ''))
        pagedetail = 1
        print('主頁條目結束...')
        f.write('主頁條目結束...\r\n')
    print('爬取結束......')


def getTextInfo(textStr):
    """Return the part of a post's text that precedes its first ``<span``.

    Args:
        textStr: Text of a post; it is converted with ``str()`` first.

    Returns:
        Everything before the first ``<span`` marker, or the whole text when
        the marker does not occur.
    """
    # str.partition yields the complete string when the marker is absent.
    # The previous find()-based slice used the -1 "not found" result as the
    # end index and silently cut off the last character of such posts.
    head, _, _ = str(textStr).partition('<span')
    return head


def getpageSize(comments, idstr):
    """Build the comment-API URL of every comment page of one post.

    The number of pages is ``int(comments / 10) + 1``, i.e. the same
    10-comments-per-page arithmetic the original loop bounds used.

    Two defects of the previous version are fixed here:

    * ``return`` sat inside the ``for`` loop, so only page 1 was produced.
    * Links were appended to a module-level list, so every call also returned
      the links of all earlier posts and they were crawled again.

    Args:
        comments: Number of comments on the post.
        idstr: Identifier of the post; used as the ``id`` query parameter.

    Returns:
        A new list of URLs, one per page, starting at page 1.
    """
    lastPage = int(comments / 10) + 1
    return [
        'https://m.weibo.cn/api/comments/show?id=' + str(idstr) + '&page=' + str(pageNo)
        for pageNo in range(1, lastPage + 1)
    ]

# parseJsonData(jsonData)


# print(str(textList))  page = 7
# print(str(detaiList))
# Output file used by parseJsonData() and parseDetailListdata() through the
# module-level name ``f``; opened in append mode.
f = open('微博京東說說跟評論.txt', 'a',encoding='utf-8')


def main_start(first_page=11, stop_page=50, delay=2):
    """Crawl the timeline pages ``first_page`` .. ``stop_page - 1``.

    Every page is downloaded with ``getJsonData`` and handed to
    ``parseJsonData``. The output file ``f`` is closed when the crawl ends,
    including when a request or a parse step raises.

    Args:
        first_page: First value of the ``page`` query parameter.
        stop_page: Page number at which to stop (exclusive).
        delay: Seconds to sleep after each page.
    """
    # The page number is appended directly after 'page='. The previous literal
    # ended in 'page={}' and was then concatenated, producing 'page={}11'.
    # NOTE(review): '%3D%40' replaces a mangled '[email protected]'
    # fragment in the lfid value and mirrors the Referer header -- confirm
    # against a freshly captured request.
    baseUrl = 'https://m.weibo.cn/api/container/getIndex?uid=5650743478&luicode=10000011&lfid=100103type%3D1%26q%3D%40%E4%BA%AC%E4%B8%9C%E5%AE%A2%E6%9C%8D&featurecode=20000320&type=uid&value=5650743478&containerid=1076035650743478&page='
    try:
        for inde in range(first_page, stop_page):
            startUrl = baseUrl + str(inde)
            pageindex = '頁數:'+str(inde)+'\r\n'
            print('startUrl   '+'index '+str(inde)+'     '+startUrl)
            f.write(pageindex)
            data = getJsonData(startUrl)
            parseJsonData(data)
            time.sleep(delay)
    finally:
        # Close the file even if the crawl is interrupted by an exception.
        f.close()


if __name__ == '__main__':
    main_start()

現在暫時可以借用這份程式碼,裡面的url跟cookie換一下,用自己的賬號就可以。另外爬蟲要學會用fiddler等類似的抓包工具,感覺確實是抓包利器。

公司996啊,加上自己的能力有限,現在的學習確實也就到這個深度了,以後要多瞭解一下cookie池、代理池之類的東西。

end