scrapy爬蟲和Django後臺結合(爬取酷我音樂)
阿新 • • 發佈:2019-01-09
程式碼結構:
Spider/spider/kuwo.py爬蟲程式碼:
# -*- coding: utf-8 -*-
import scrapy
import demjson
import re
import os
from ..items import MusicItem, SingerItem
from bloomfilter import Bloomfilter  # Bloom filter used to skip already-seen singers


class KuwoSpider(scrapy.Spider):
    """Crawl the Kuwo artist directory and the first page of each artist's songs.

    One artist-list request is issued per prefix letter a-z.  Every artist whose
    picture path has not been recorded in the Bloom filter yields a
    ``SingerItem`` plus a follow-up request whose response is turned into
    ``MusicItem`` objects by :meth:`parse_music`.
    """

    name = 'kuwo'
    allowed_domains = ['kuwo.cn']
    start_urls = [
        'http://artistlistinfo.kuwo.cn/mb.slist?stype=artistlist&category=0&order=dict&pn=0&rn=100&encoding=utf8&prefix='
    ]

    def __init__(self, name=None, **kwargs):
        # Forward the spider arguments as real keyword arguments.  The previous
        # ``kwargs=kwargs`` handed the base class a single argument literally
        # named "kwargs" instead of the individual spider arguments.
        super(KuwoSpider, self).__init__(name=name, **kwargs)
        if not os.path.exists("singer.state"):
            self.bloom = Bloomfilter(10000000)
        else:
            # The suffix of the saved state file is arbitrary.
            self.bloom = Bloomfilter("singer.state")

    @staticmethod
    def _to_int(value, default):
        """Return ``value`` as a non-negative int, or ``default`` if it is not one."""
        try:
            number = int(value)
        except (TypeError, ValueError):
            return default
        return number if number >= 0 else default

    def start_requests(self):
        """Yield one artist-list request per lowercase ASCII letter."""
        for code in range(97, 123):
            prefix = chr(code)
            yield scrapy.Request(
                url=self.start_urls[0] + prefix,
                callback=self.parse,
                dont_filter=True,
                meta={'prefix': prefix}
            )

    def parse(self, response):
        """Handle one page of the artist list and schedule the next page."""
        meta = response.meta
        json_obj = demjson.decode(response.text)
        # The counters may be decoded as strings or as numbers; accept both.
        total = self._to_int(json_obj.get("total", "0"), 0)
        # Fall back to 100 when "rn" is missing, invalid or zero so the page
        # computation below can never divide by zero.
        rn = self._to_int(json_obj.get("rn", "100"), 100) or 100
        total_page = total // rn if total % rn == 0 else total // rn + 1

        # Process and store the data.
        for artist in json_obj.get('artistlist', []):
            pic = artist.get('pic')
            if self.bloom.test(pic):
                print("資料已經存在")
                continue

            item = SingerItem()
            item['singer_id'] = artist.get("id")
            url = "http://search.kuwo.cn/r.s?stype=artist2music&artistid={}&pn=0&rn=100&sortby=0&show_copyright_off=1&alflac=1&pcmp4=1&encoding=utf8&vipver=MUSIC_8.7.7.0_PQ&plat=pc&devid=51016591&thost=search.kuwo.cn".format(item['singer_id'])
            yield scrapy.Request(
                url=url,
                callback=self.parse_music,
                dont_filter=True,
            )
            item['singer_name'] = artist.get("name")
            item['singer_music_num'] = artist.get("music_num")
            item['singer_listen'] = artist.get("listen")
            item['singer_like'] = artist.get("like")
            item['singer_pic'] = pic
            item['singer_aartist'] = artist.get("AARTIST")
            item['singer_isstar'] = artist.get("isstar")
            item['singer_prefix'] = response.meta.get("prefix")
            yield item
            self.bloom.add(pic)
            # Persist the filter so an interrupted crawl can resume.
            self.bloom.save("singer.state")

        # Build the next page URL by bumping the "pn" query parameter.
        found = re.search(r"pn=(\d+)", response.url)
        pn = (int(found.group(1)) if found else 0) + 1
        url = re.sub(r"pn=\d+", "pn={}".format(pn), response.url)
        print("---------------", url)
        if pn < total_page:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                dont_filter=True,
                meta=meta
            )

    def parse_music(self, response):
        """Yield a ``MusicItem`` for every entry of the response's ``musiclist``."""
        json_obj = demjson.decode(response.text)
        for music in json_obj.get('musiclist', []):
            item = MusicItem()
            item['music_musicrid'] = music.get("musicrid")
            item['music_name'] = music.get("name")
            item['music_artist'] = music.get("artist")
            item['music_releasedate'] = music.get("releasedate")
            item['music_artistid'] = music.get("artistid")
            # The model field is spelled "alnumind"; it stores the album id.
            item['music_alnumind'] = music.get("albumid")
            item['music_album'] = music.get("album")
            yield item
items.py 程式碼
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy from scrapy_djangoitem import DjangoItem from api.models import Singer, Music class SingerItem(DjangoItem): django_model = Singer class MusicItem(DjangoItem): django_model = Music
Spider/spider/pipelines.py程式碼(儲存爬取的資料)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from urllib.parse import urljoin

from scrapy.pipelines.images import ImagesPipeline, FilesPipeline
from scrapy.http import Request

from .items import SingerItem, MusicItem


class SpiderPipeline(object):
    """Pass-through pipeline that returns every item unchanged."""

    def process_item(self, item, spider):
        return item


class MyImagesPipeline(FilesPipeline):
    """Download singer pictures and song files, then save the item via Django.

    Items that carry ``singer_pic`` are treated as singers; everything else is
    treated as a music item.
    """

    def get_media_requests(self, item, info):
        """Return the download requests for ``item``.

        The target path is carried in ``request.meta['path']`` so that
        :meth:`file_path` can decide where the file is stored.
        """
        if 'singer_pic' in dict(item):
            pic = item['singer_pic']
            if not pic:
                # Nothing to download for a singer without a picture path.
                return []
            pic_url = urljoin("http://img1.sycdn.kuwo.cn/star/starheads/", pic)
            return [Request(pic_url, meta={'path': pic})]

        music_id = item['music_musicrid']
        music_src = "http://antiserver.kuwo.cn/anti.s?rid=MUSIC_{}&format=aac|mp3&type=convert_url&response=url".format(music_id)
        # str() keeps the concatenation safe when the id is numeric.  The
        # extension was previously misspelled ".acc"; the request asks for aac.
        return [Request(music_src, meta={'path': str(music_id) + '.aac'})]

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for a download.

        Paths containing ``/`` (singer pictures) go under ``../imgs/``; all
        other paths (songs) go under ``../musics/``.  ``item`` is accepted as a
        keyword-only argument for compatibility with Scrapy versions that pass
        it; it is not used.
        """
        path = request.meta.get('path')
        if '/' in path:
            return "../imgs/" + path
        return "../musics/" + path

    def item_completed(self, results, item, info):
        """Record the stored song path on music items and save the item.

        ``results`` holds ``(success, info_or_failure)`` tuples.  On failure
        the second element is not a dict, so the path is only read after a
        successful download; otherwise the placeholder text is stored.
        """
        print('=====', results)
        if 'singer_pic' not in dict(item):
            ok, value = results[0] if results else (False, None)
            path = value.get('path', '') if ok else '沒有路徑'
            item['music_src'] = path.replace("../", "")
            item['music_lrc_src'] = ""
        # NOTE(review): the original layout was lost; saving both singer and
        # music items is assumed because the API views read singers from the
        # database - confirm against the original file.
        item.save()
        return item
Scrapy 專案的 settings.py 程式碼(管道與檔案下載相關配置):
# Item pipelines enabled for the spider project.
# NOTE(review): both entries use the same order value (300) - confirm that the
# relative order of the two pipelines does not matter.
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
    'Spider.pipelines.MyImagesPipeline': 300,
}
# NOTE(review): MyImagesPipeline overrides get_media_requests and builds its
# own requests, so this field name may not actually be consulted - confirm.
FILES_URLS_FIELD = "singer_pic"
# Base directory for downloaded files.
FILES_STORE = "../imgs/"
common/orm2json.py 程式碼:
對資料進行json序列化
from django.db.models.query import QuerySet
import datetime
def object_to_json(model, ignore=None):
    """Convert a model instance, or a collection of them, to plain Python data.

    Args:
        model: a single model instance, a ``list`` of instances or a
            ``QuerySet``.
        ignore: optional list that is forwarded to the per-object converter.

    Returns:
        A list of dicts when ``model`` is a list or ``QuerySet`` (including
        subclasses of either), otherwise a single dict.
    """
    if ignore is None:
        ignore = []
    # isinstance() instead of an exact type() comparison, so subclasses of
    # list / QuerySet are treated as collections too.
    if isinstance(model, list) or isinstance(model, QuerySet):
        return [_django_single_object_to_json(element, ignore) for element in model]
    return _django_single_object_to_json(model, ignore)
def _django_single_object_to_json(element, ignore=None):
    """Return a dict mapping each model field name of ``element`` to its value.

    Args:
        element: an object exposing ``_meta.fields``, each field having a
            ``name`` attribute.
        ignore: optional iterable of field names to leave out of the result.
            This argument used to be accepted but silently ignored.

    Returns:
        ``{field_name: value}`` for every field not listed in ``ignore``.
    """
    skipped = set(ignore) if ignore else set()
    return {
        field.name: getattr(element, field.name)
        for field in element._meta.fields
        if field.name not in skipped
    }
api/models.py程式碼:
from django.db import models
class Singer(models.Model):
    """A singer record; one row per artist collected by the spider."""
    # Artist id as reported by the remote artist list ("id").
    singer_id = models.IntegerField()
    singer_name = models.CharField(max_length=200)
    singer_music_num = models.IntegerField()
    singer_listen = models.IntegerField()
    singer_like = models.IntegerField()
    # Picture path as reported by the remote artist list ("pic").
    singer_pic = models.CharField(max_length=200)
    singer_aartist = models.CharField(max_length=200)
    singer_isstar = models.IntegerField()
    # Prefix letter of the artist-list request that returned this singer.
    singer_prefix = models.CharField(max_length=200, default='')
    singer_ishot = models.BooleanField(default=False)
class Music(models.Model):
    """A song record; one row per entry of an artist's music list."""
    music_musicrid = models.IntegerField()
    music_name = models.CharField(max_length=200)
    music_artist = models.CharField(max_length=200)
    music_releasedate = models.CharField(max_length=200)
    # Id of the artist; the API looks songs up by this column.
    music_artistid = models.IntegerField()
    music_album = models.CharField(max_length=200)
    # Holds the album id ("albumid" in the crawled data); the field name is
    # spelled this way throughout the project.
    music_alnumind = models.IntegerField()
    # Local path, not a remote URL
    music_src = models.CharField(max_length=200)
    music_lrc_src = models.CharField(max_length=200)
在 KuWo 專案的 Django 配置檔案 settings.py 中需要做以下配置:
# MySQL connection used by the Django project.
# NOTE(review): credentials are hard-coded; load them from the environment
# before deploying anywhere but a local machine.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        # Database name
        'NAME': 'kuwodb',
        'USER': 'root',
        'PASSWORD': '123456',
        # Was misspelled as 'POST': '12.0.0.1', a key Django does not read.
        'HOST': '127.0.0.1',
        'PORT': 3306
    }
}
# Redis cache configuration: the default cache uses the django_redis backend
# and connects to a Redis server at 127.0.0.1:6379.
CACHES = {
    'default': {
        'BACKEND': 'django_redis.cache.RedisCache',
        'LOCATION': 'redis://127.0.0.1:6379',
        "OPTIONS": {
            "CLIENT_CLASS": "django_redis.client.DefaultClient",
        },
    },
}
在 settings.py 的 MIDDLEWARE 中註冊 common/middleware.py 裡的自定義中介軟體:
# Middleware stack; the common.middleware.* entries are the project's own.
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    # Custom middleware; it must be placed at least after SessionMiddleware
    'common.middleware.MyCustomMiddleware',
    # Middleware that filters requests by User-Agent
    'common.middleware.BadUserAgentMiddleware',
    # Middleware that filters which IPs may access the system
    'common.middleware.GoodIpMiddlleware',
    # # Checks whether the cookies contain a required field
    # 'common.middleware.BadCookieMiddleware',
    # Middleware that limits how often an IP may visit
    'common.middleware.SlowSpeedMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
common/middleware.py程式碼:
實現安全相關的中介軟體:過濾 User-Agent、過濾 IP、檢查 Cookie,以及限制訪問頻率
import re
from django.conf import settings
from django.http import HttpResponsePermanentRedirect, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseNotFound
from django.utils.deprecation import MiddlewareMixin
import time
import demjson
class MyCustomMiddleware(MiddlewareMixin):
    """Minimal middleware that prints a message for every incoming request."""

    def process_request(self, request):
        # Returns None implicitly, so request handling continues.
        print('有人來訪問後臺了')

    def process_response(self, request, response):
        # The response is passed through unchanged.
        return response
class BadUserAgentMiddleware(MyCustomMiddleware):
    """Reject requests whose User-Agent is missing or looks like a crawler.

    The keyword match is case-insensitive, so agents such as ``Scrapy/2.x`` or
    ``Python-urllib/3.x`` are caught as well as their lowercase spellings.
    """

    # Lowercase substrings that mark a User-Agent as an automated client.
    blocked_keywords = ('python', 'requests', 'scrapy')

    def process_request(self, request):
        """Return a 400 response for blocked agents, otherwise ``None``."""
        user_agent = request.META.get('HTTP_USER_AGENT', '')
        print(user_agent)
        if not user_agent:
            return HttpResponseBadRequest(content='你是一個爬蟲吧?')
        lowered = user_agent.lower()
        if any(keyword in lowered for keyword in self.blocked_keywords):
            return HttpResponseBadRequest(content='你是一個框架寫的爬蟲吧')
        return None
class GoodIpMiddlleware(MiddlewareMixin):
    """Refuse requests coming from a block-listed client address.

    Two typos previously made this middleware a no-op: the hook was named
    ``process_ruquest`` (never called by Django) and the address was read from
    the non-existent ``REMOVE_ADDR`` key.  With both fixed the block list below
    is actually enforced - note that it contains the local address, so local
    requests are rejected while this middleware is enabled.
    """

    # Client addresses that are refused.
    blocked_ips = ('127.0.0.1', 'localhost')

    def process_request(self, request):
        """Return a 403 response for block-listed addresses, otherwise ``None``."""
        ip = request.META.get('REMOTE_ADDR')
        if ip in self.blocked_ips:
            return HttpResponseForbidden(content='你的ip禁止訪問該系統!!!')
        return None

    # Old misspelled name kept as an alias for any direct callers.
    process_ruquest = process_request
class BadCookieMiddleware(MiddlewareMixin):
    """Reject requests that do not carry a ``my_name`` cookie."""

    def process_request(self, request):
        """Return a 400 response when the cookie is absent, otherwise ``None``."""
        if 'my_name' in request.COOKIES:
            return None
        return HttpResponseBadRequest(content='不是一個好cookie')
# Length of the sliding window, in seconds.
VISIT_TOTAL_TIME = 60
# Number of recorded visits inside the window above which requests are
# refused (the name says "per second", but it is compared per window).
VISIT_PER_SECOND = 10
# Maps a client address to its visit timestamps, newest first.
# NOTE(review): this dict lives in process memory and is never pruned of idle
# addresses; it is not shared between worker processes.
AllOW = {}


class SlowSpeedMiddleware(MiddlewareMixin):
    """Limit how many requests a single client address may make per window.

    The address is taken from ``REMOTE_ADDR``.  Previously every request was
    accounted under the hard-coded ``ip`` attribute, so all clients shared a
    single bucket and one busy client throttled everybody.
    """

    # Fallback bucket used only when the request carries no REMOTE_ADDR.
    ip = '1.1.1.1'

    def process_request(self, request):
        """Record the visit, or return a JSON error when the limit is exceeded."""
        ctime = time.time()
        ip = request.META.get('REMOTE_ADDR') or self.ip
        time_list = AllOW.setdefault(ip, [])
        # Drop timestamps that have fallen out of the window (oldest is last).
        while time_list and ctime - VISIT_TOTAL_TIME > time_list[-1]:
            time_list.pop()
        if len(time_list) > VISIT_PER_SECOND:
            error_msg = {
                'msg': '訪問頻率太快啦!限制你的{}!{}秒後再試!!!'.format(ip, self.wait(ip))
            }
            return HttpResponseNotFound(content=demjson.encode(error_msg), content_type='application/json')
        time_list.insert(0, ctime)
        return None

    def wait(self, ip=None):
        """Return the whole seconds until the oldest recorded visit expires.

        Args:
            ip: address whose bucket is inspected; defaults to ``self.ip`` so
                existing zero-argument calls keep working.

        Returns:
            Remaining seconds as an int, or 0 when the address has no
            recorded visits.
        """
        if ip is None:
            ip = self.ip
        time_list = AllOW.get(ip)
        if not time_list:
            return 0
        first_in_time = time_list[-1]
        return int(VISIT_TOTAL_TIME - (time.time() - first_in_time))
然後在Terminal中執行命令進行資料遷移,建立資料表,在此之前要先把資料庫建好
python manage.py makemigrations
python manage.py migrate
# Extra static-file directories under the project root.
# NOTE(review): "imgs" is presumably listed so that the pictures downloaded by
# the spider can be served as static files - confirm.
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, "static"),
    os.path.join(BASE_DIR, "imgs"),
]
# Default cache: django_redis backend talking to Redis at 127.0.0.1:6379.
CACHES = {
    'default': {
        'BACKEND': 'django_redis.cache.RedisCache',
        'LOCATION': 'redis://127.0.0.1:6379',
        "OPTIONS": {
            "CLIENT_CLASS": "django_redis.client.DefaultClient",
        },
    },
}
api/views.py 程式碼:
先從 redis 快取中讀取資料,若快取中沒有則從資料庫查詢,並將查詢結果快取到 redis 中 60 秒(程式執行時要開啟redis服務)
from django.shortcuts import render
from django.http import JsonResponse
from common.decorate import api_json
from .models import Singer, Music
from common.orm2json import object_to_json
from django.db import connection
from django.core.cache import cache
# Default page size, and also the largest page size a request may ask for.
PAGE_SIZE = 10
def index(request):
    """Return a fixed greeting as JSON."""
    return JsonResponse({'msg': 'hello world'})
def get_singers(request):
    """Build the paginated singer listing for a request.

    Query parameters:
        page:  1-based page number; anything that is not a positive integer
               becomes 1.
        size:  page size; invalid values become ``PAGE_SIZE`` and larger
               values are capped at ``PAGE_SIZE`` with an error status.
        cname: ``hot`` (default) for singers ordered by listen count, or a
               name whose first letter selects singers by prefix.  Unknown
               prefixes fall back to ``'a'``.

    Returns:
        A dict with the keys msg, status, singer_total, current_page,
        total_page, page_size, cname, pages and singers.
    """
    msg = '查詢成功'
    status = 5

    # isdecimal() only accepts text that int() can convert, and the "> 0"
    # check also rejects values such as "00".
    page = request.GET.get('page', '1')
    page = int(page) if page.isdecimal() and int(page) > 0 else 1
    size = request.GET.get('size', str(PAGE_SIZE))
    size = int(size) if size.isdecimal() and int(size) > 0 else PAGE_SIZE

    cname = request.GET.get('cname', 'hot').lower()
    if cname != 'hot':
        # get_cnames() returns database rows (one-element sequences), so the
        # prefixes are unpacked before the membership test.  Comparing the raw
        # string against the rows never matched and always produced 'a'.
        known = {
            row[0] if isinstance(row, (list, tuple)) else row
            for row in get_cnames(request)
        }
        initial = cname[:1]
        cname = initial if initial and initial in known else 'a'

    key = "singer_{}".format(cname)
    # A single get() avoids the window between has_key() and get() in which
    # the entry can expire.
    singers = cache.get(key)
    if singers is None:
        if cname == 'hot':
            singers = get_hot(request)
        else:
            singers = Singer.objects.filter(singer_prefix=cname)
        cache.set(key, singers, 60)
    else:
        print('從快取中讀取', key)

    singer_total = len(singers)
    if size > PAGE_SIZE:
        status = False
        msg = '頁碼超過指定範圍'
        size = PAGE_SIZE
    total_page = singer_total // size if singer_total % size == 0 else singer_total // size + 1
    if page > total_page:
        msg = '超過總頁數'
        status = False

    # Convert only the requested page from ORM objects to dicts.
    singers = object_to_json(singers[(page - 1) * size: page * size])
    return {
        'msg': msg,
        'status': status,
        'singer_total': singer_total,
        'current_page': page,
        'total_page': total_page,
        'page_size': size,
        'cname': cname,
        'pages': [],
        'singers': singers,
    }
def get_hot(request):
    """Return all singers ordered by listen count, highest first."""
    return Singer.objects.order_by('-singer_listen')
def get_cnames(request):
    """Return the distinct singer prefixes stored in the database.

    Returns:
        The rows fetched by the query; each row is a one-element sequence
        holding a ``singer_prefix`` value.
    """
    # The context manager closes the cursor even if the query raises.
    with connection.cursor() as cursor:
        cursor.execute("select singer_prefix from api_singer group by singer_prefix")
        return cursor.fetchall()
def get_music_by_singer_id(request):
    """Build the paginated song listing of one singer for a request.

    Query parameters:
        singer_id: artist id; missing or non-numeric values become 0.
        page:      1-based page number; anything that is not a positive
                   integer becomes 1.
        size:      page size; invalid values become ``PAGE_SIZE`` and larger
                   values are capped at ``PAGE_SIZE`` with an error status.

    Returns:
        A dict with the keys msg, status, music_total, current_page,
        total_page, page_size, pages and musics.
    """
    msg = '查詢成功'
    status = 5

    # isdecimal() only accepts text that int() can convert.
    singer_id = request.GET.get('singer_id')
    singer_id = int(singer_id) if singer_id and singer_id.isdecimal() else 0
    page = request.GET.get('page', '1')
    page = int(page) if page.isdecimal() and int(page) > 0 else 1
    size = request.GET.get('size', str(PAGE_SIZE))
    size = int(size) if size.isdecimal() and int(size) > 0 else PAGE_SIZE

    key = "music_{}".format(singer_id)
    # A single get() avoids the window between has_key() and get() in which
    # the entry can expire and None would be passed on for conversion.
    musics = cache.get(key)
    if musics is None:
        musics = Music.objects.filter(music_artistid=singer_id)
        cache.set(key, musics, 60)
    else:
        print('從快取中讀取', key)

    music_total = len(musics)
    if size > PAGE_SIZE:
        status = False
        msg = '頁碼超過指定範圍'
        size = PAGE_SIZE
    total_page = music_total // size if music_total % size == 0 else music_total // size + 1
    if page > total_page:
        msg = '超過總頁數'
        status = False

    # Convert only the requested page from ORM objects to dicts, rather than
    # every song of the singer.
    musics = object_to_json(musics[(page - 1) * size: page * size])
    return {
        'msg': msg,
        'status': status,
        'music_total': music_total,
        'current_page': page,
        'total_page': total_page,
        'page_size': size,
        'pages': [],
        'musics': musics,
    }
def get_music_src_by_music_id(request):
    # Placeholder: not implemented yet; currently returns None.
    pass
def get_lrc_src_by_music_id(request):
    # Placeholder: not implemented yet; currently returns None.
    pass
@api_json
def singers(request):
    """View for the singer listing; api_json wraps the returned dict in a JsonResponse."""
    return get_singers(request)
@api_json
def musics(request):
    """View for a singer's songs; api_json wraps the returned dict in a JsonResponse."""
    return get_music_by_singer_id(request)
common/decorate.py程式碼:
from django.http import JsonResponse
from functools import wraps
def api_json(func):
    """Decorator that wraps the decorated function's return value in a ``JsonResponse``."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        return JsonResponse(func(*args, **kwargs))

    return wrapper
# Manual test: the code below does not run when this file is imported by
# another module.
if __name__ == '__main__':
    @api_json
    def hello():
        return {'name': 'zhangsan'}
    print(hello())
在路由中配置路徑:(KuWo/urls.py)
from django.contrib import admin
from django.urls import path
from api import views as api_views
# URL routes: the admin site, the greeting at the root, and the two JSON
# endpoints for singers and songs.
urlpatterns = [
    path('admin/', admin.site.urls),
    path('', api_views.index),
    path('singers/', api_views.singers),
    path('musics/', api_views.musics),
]