爬取网易云音乐歌单里面的歌曲 | 臭大佬

臭大佬 2020-03-29 23:54:21 1878
Python 
简介 爬取网易云音乐歌单里面的歌曲

数据表

-- Playlist table. `title` and `link` are written when the playlist listing is
-- scraped; the remaining columns are filled in later from each playlist's own
-- page. All integer columns use the same int(11) declaration (the display
-- width does not limit the range of values that can be stored), and the table
-- starts with a fresh AUTO_INCREMENT counter instead of one carried over from
-- an exported database.
CREATE TABLE `song_list` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '歌单名',
  `link` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '链接',
  `cover` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '略缩图',
  `favorite` int(11) NOT NULL DEFAULT '0' COMMENT '收藏数',
  `share` int(11) NOT NULL DEFAULT '0' COMMENT '分享数',
  `author` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '作者',
  `comment` int(11) NOT NULL DEFAULT '0' COMMENT '评论数',
  `play_num` int(11) NOT NULL DEFAULT '0' COMMENT '播放次数',
  `song_num` int(11) DEFAULT '0' COMMENT '歌曲数',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Song table: one row per track scraped from a playlist page.
-- `cate_id` holds the `id` of the `song_list` row (the playlist) the track was
-- found in, and is indexed. No foreign key constraint is declared.
-- `duration` is stored as the text shown on the page, not as a number.
CREATE TABLE `song` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `cate_id` int(11) NOT NULL DEFAULT '0' COMMENT '所属分类',
  `title` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '标题',
  `link` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '链接',
  `duration` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '时长',
  `singer` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '歌手',
  `album` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '专辑',
  PRIMARY KEY (`id`),
  KEY `cate_id` (`cate_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='歌曲表';

操作说明

我们在上一篇的基础上,给 song_list 表增加了多个字段,这样有利于统计数据。
如果 song_list 表还没有数据,需要先运行截图部分(即最终代码末尾)被注释掉的“存储歌单”代码。

把本篇代码末尾的入口部分改成如下:

if __name__ == '__main__':
    # Store the playlists: walk 38 listing pages of 35 playlists each
    limit = 35
    for p in range(0, 38):
        offset = limit * p
        get_song_list(limit, offset)
    # Store the songs (left disabled for this first run)
    # get_song()

运行完成后,再改回最终代码。

代码

# coding:utf-8
from bs4 import BeautifulSoup
import time
import pymysql
from selenium import webdriver
import re

# Use selenium and instantiate a browser engine.
# This runs at module level: the Chrome driver at ./chromedriver.exe is started
# as soon as the script is loaded.
DRIVER = webdriver.Chrome(executable_path=r"./chromedriver.exe")
# Connect to the database (local MySQL, schema `python_test`, utf8mb4 charset)
CON = pymysql.connect(host='localhost', port=3306, user='root', password='root', db='python_test',
                      charset='utf8mb4')
# Get a cursor object that can execute SQL statements; it is shared by every
# function below for both reads and writes
CUR = CON.cursor()

# Site root: used to build the playlist listing URL and prepended to the
# relative hrefs scraped from it
BASEURL = 'https://music.163.com'

# Database write helper
def db_write(sql, params=None):
    """Execute one write statement on the shared cursor and commit it.

    Failures are reported on stdout and do not propagate, so one bad row
    does not stop the crawl.

    Args:
        sql: The SQL statement to run. It may contain ``%s`` placeholders
            when ``params`` is supplied.
        params: Optional values for the placeholders; they are escaped by the
            driver. With the default ``None`` the statement is executed
            verbatim.
    """
    print(sql)
    try:
        CUR.execute(sql, params)
        CON.commit()
        print("数据写入成功")
    except Exception as e:
        print(e)
        # Discard whatever the failed statement left pending so that the next
        # write starts from a clean transaction. A rollback failure is only
        # reported, keeping this helper best-effort.
        try:
            CON.rollback()
        except Exception as rollback_error:
            print(rollback_error)


# Fetch playlists
def get_song_list(limit=35, offset=0):
    """Scrape one page of the hot-playlist listing and store every playlist.

    Each ``p.dec > a`` anchor found inside the ``g_iframe`` frame becomes one
    ``song_list`` row holding the anchor's ``title`` and its absolute link.

    Args:
        limit: Value sent as the ``limit`` query parameter of the listing URL.
        offset: Value sent as the ``offset`` query parameter of the listing
            URL.
    """
    req_url = BASEURL + '/#/discover/playlist/?order=hot&cat=全部&limit=' + str(limit) + '&offset=' + str(offset)
    # Open the listing page
    DRIVER.get(req_url)
    # The listing is rendered inside a frame, so switch into it with selenium
    DRIVER.switch_to.frame("g_iframe")
    req = DRIVER.page_source.encode('utf-8')
    soup = BeautifulSoup(req, 'lxml')
    for song in soup.select("p.dec > a"):
        href = song.get('href')
        if not href:
            # Without a target there is no playlist page to visit later
            continue
        title = song.get('title') or ''
        link = BASEURL + href
        # Let the driver escape the scraped text: a title containing a quote
        # would otherwise break the statement. mogrify returns the final SQL
        # string, which keeps db_write's single-argument call unchanged.
        sql = CUR.mogrify("insert into song_list(title,link)values(%s,%s)", (title, link))
        db_write(sql)


# Fetch songs
def get_song():
    """Visit every stored playlist, complete its row and save its songs.

    Reads ``link`` and ``id`` of every ``song_list`` row, opens each playlist
    page in the shared browser, updates that playlist's row with the cover,
    author, track count and the favorite/share/comment/play counts shown on
    the page, then inserts one ``song`` row per row of the track table.

    Scraped text is escaped through ``CUR.mogrify`` so that quotes in song,
    artist or album names no longer break the statements. The current browser
    window is closed when the crawl ends, including when it ends with an
    exception.
    """
    # Fetch all playlists up front: the same cursor is reused for the writes
    CUR.execute('select link,id from song_list')
    links = CUR.fetchall()
    try:
        for link_url, link_id in links:
            DRIVER.get(link_url)
            # Wait three seconds so the page can finish loading
            time.sleep(3)
            # The playlist is rendered inside a frame, so switch into it
            DRIVER.switch_to.frame("g_iframe")
            time.sleep(0.5)
            # A missing attribute is stored as '' (the column default)
            cover = DRIVER.find_element_by_css_selector('.cover > img').get_attribute('src') or ''
            favorite = DRIVER.find_element_by_css_selector('.u-btni-fav > i').text
            share = DRIVER.find_element_by_css_selector('.u-btni-share > i').text
            # Strip the parentheses wrapped around the two counters
            favorite = re.sub(r'[()]', '', favorite)
            share = re.sub(r'[()]', '', share)
            author = DRIVER.find_element_by_css_selector('.name > a').text
            comment = DRIVER.find_element_by_css_selector('#cnt_comment_count').text
            play_num = DRIVER.find_element_by_css_selector('#play-count').text
            song_num = DRIVER.find_element_by_css_selector('#playlist-track-count').text
            # Complete the playlist row
            song_list_sql = CUR.mogrify(
                "UPDATE song_list SET cover=%s, favorite=%s, share=%s, author=%s, comment=%s, play_num=%s, song_num=%s where link=%s",
                (cover, favorite, share, author, comment, play_num, song_num, link_url))
            db_write(song_list_sql)
            # Store the songs: one table row per track
            songs_tr = DRIVER.find_elements_by_xpath('//table/tbody/tr')
            for tr in songs_tr:
                song_title = tr.find_element_by_css_selector(
                    "td:nth-child(2) > div > div > div > span > a > b").get_attribute('title') or ''
                song_link = tr.find_element_by_css_selector(
                    "td:nth-child(2) > div > div > div > span > a").get_attribute('href') or ''
                song_duration = tr.find_element_by_css_selector("td:nth-child(3) > span").text
                song_singer = tr.find_element_by_css_selector("td:nth-child(4) > div").get_attribute('title') or ''
                song_album = tr.find_element_by_css_selector("td:nth-child(5) > div > a").get_attribute('title') or ''
                song_sql = CUR.mogrify(
                    "insert into song(cate_id,title,link,duration,singer,album)values(%s,%s,%s,%s,%s,%s)",
                    (link_id, song_title, song_link, song_duration, song_singer, song_album))
                db_write(song_sql)
    finally:
        # Done (or aborted): close the current browser window
        DRIVER.close()


if __name__ == '__main__':
    # Step 1 (disabled): store the playlists by walking 38 listing pages of
    # 35 playlists each. Uncomment for the first run, while song_list is empty.
    # limit = 35
    # for p in range(0, 38):
    #     offset = limit * p
    #     get_song_list(limit, offset)
    # Step 2: store the songs of every playlist already present in song_list
    get_song()

结果