LittlePaimon/Guess_voice/download_data.py

import json
import os
from pathlib import Path

from bs4 import BeautifulSoup
from littlepaimon_utils import aiorequests
from nonebot import logger
from sqlitedict import SqliteDict  # TODO: add to requirements
from .util import get_path

OUT_PUT = Path() / 'data' / 'LittlePaimon' / 'guess_voice' / 'voice'
BASE_URL = 'https://wiki.biligame.com/ys/'
BASE_URL_MYS = 'https://bbs.mihoyo.com'
BASE_URL_MYS_CHARACTERS_LIST = '/ys/obc/channel/map/189/25?bbs_presentation_style=no_header'
# wiki page names: '角色' is the character list page, '%s语音' a character's voice page
API = {'character_list': '角色', 'voice': '%s语音'}

config = {
    # languages to download: 日 (Japanese), 英 (English), 韩 (Korean)
    'voice_language': ['日', '英', '韩']
}

dir_data = Path() / 'data' / 'LittlePaimon' / 'guess_voice' / 'data'
dir_data.mkdir(parents=True, exist_ok=True)

############
def init_db(db_dir, db_name='db.sqlite') -> SqliteDict:
    return SqliteDict(str(get_path(db_dir, db_name)),
                      encode=json.dumps,
                      decode=json.loads,
                      autocommit=True)


db = init_db('data', 'voice.sqlite')
############
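
# Each db entry maps a character name to the list of voice records built in
# update_voice_data below, i.e. dicts with 'title', 'text', 'chn', 'jap',
# 'eng' and 'kor' keys.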


# Fetch the character list from the biligame wiki
async def get_character_list():
    html = await aiorequests.get(url=(BASE_URL + API['character_list']))
    soup = BeautifulSoup(html.text, 'lxml')
    char_list = soup.find(attrs={
        'class': 'resp-tab-content',
        'style': 'display:block;'
    })
    # character cards are grouped by rarity CSS classes ('g C5星', 'g C5', 'g C4星')
    char_list1 = char_list.find_all(attrs={'class': 'g C5星'})
    res = list(set(map(lambda x: x.find('div', class_='L').text, char_list1)))
    char_list2 = char_list.find_all(attrs={'class': 'g C5'})
    res.extend(list(set(map(lambda x: x.find('div', class_='L').text, char_list2))))
    char_list3 = char_list.find_all(attrs={'class': 'g C4星'})
    res.extend(list(set(map(lambda x: x.find('div', class_='L').text, char_list3))))
    res.sort()
    return res
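
# The three set() calls above de-duplicate names within each rarity group;
# the combined list is then sorted so callers get a stable order.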


# Fetch a character's voice lines from the biligame wiki
async def get_voice_info(character_name: str):
    logger.info('Fetching data: %s' % character_name)
    html = await aiorequests.get(url=(BASE_URL + API['voice'] % character_name))
    soup = BeautifulSoup(html.text, 'lxml')
    # the wiki renders this placeholder text when the voice page does not exist yet
    if soup.find(text='本页面目前没有内容。您可以在其他页面中'):
        return None
    voice_list = soup.find_all(attrs={'class': 'visible-md'})[2:]
    info_list = []
    for item in voice_list:
        item_tab = item.find_all(attrs={'class': ''})[1:]
        if isinstance(item_tab[1].next, str):
            return info_list
        info_list.append({
            'title': item_tab[0].text,
            'text': item_tab[5].text,
            '中': item_tab[1].next.attrs.get('data-src', ''),
            '日': item_tab[2].next.attrs.get('data-src', ''),
            '英': item_tab[3].next.attrs.get('data-src', ''),
            '韩': item_tab[4].next.attrs.get('data-src', ''),
        })
    return info_list
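
# Minimal usage sketch (assumes an active event loop; the character name is
# only an example):
#   info = await get_voice_info('钟离')
#   # -> [{'title': ..., 'text': ..., '中': url, '日': url, '英': url, '韩': url}, ...]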


# Fetch a character's voice lines via the official miHoYo wiki (米游社),
# which provides the complete Chinese/Japanese/English/Korean audio.
async def get_voice_info_mys(character_name: str):
    character_name = character_name.strip()
    logger.info('Fetching data: %s' % character_name)
    html = await aiorequests.get(url=(BASE_URL_MYS + BASE_URL_MYS_CHARACTERS_LIST))
    soup = BeautifulSoup(html.text, 'lxml')
    # locate the character's detail page on the character map
    soup_char_container = soup.select('.collection-avatar')[0]
    url_char_page = None
    for char_soup in soup_char_container.select('.collection-avatar__title'):
        if char_soup.text.find(character_name) != -1:
            url_char_page = char_soup.parent.attrs.get('href', None)
            break
    if url_char_page is None:
        return None
    html = await aiorequests.get(url=(BASE_URL_MYS + url_char_page))
    soup = BeautifulSoup(html.text, 'lxml')
    soup_voice_languages, soup_voice_lists = soup.select('[data-part="voiceTab"] > ul')
    # map each language to the data-index of its tab on the page
    language_tab_indices = {
        '中': -1,
        '日': -1,
        '英': -1,
        '韩': -1
    }
    for soup_lan in soup_voice_languages.select('li'):
        language = soup_lan.text
        language_tab_index = int(soup_lan.attrs.get('data-index'))
        if language.find('汉') != -1 or language.find('中') != -1:
            language_tab_indices['中'] = language_tab_index
        elif language.find('日') != -1:
            language_tab_indices['日'] = language_tab_index
        elif language.find('英') != -1:
            language_tab_indices['英'] = language_tab_index
        elif language.find('韩') != -1:
            language_tab_indices['韩'] = language_tab_index
    language_voices = {
        '中': [],
        '日': [],
        '英': [],
        '韩': []
    }
    # collect one audio URL per voice line for every language tab
    for lan, voice_list in language_voices.items():
        for soup_row in soup_voice_lists.select(f'li[data-index="{language_tab_indices[lan]}"] > table:nth-of-type(2) > tbody > tr'):
            soup_source = soup_row.select('audio > source')
            voice_list.append(soup_source[0].attrs.get('src') if len(soup_source) != 0 else '')
    info_list = []
    soup_title = soup_voice_lists.select('li:first-child > table:nth-of-type(2) > tbody > tr td:nth-child(1)')
    soup_text = soup_voice_lists.select('li:first-child > table:nth-of-type(2) > tbody > tr td:nth-child(2) > div > span')
    for index in range(len(soup_title)):
        info_list.append({
            'title': soup_title[index].text.strip(),
            'text': soup_text[index].text.strip(),
            '中': language_voices['中'][index],
            '日': language_voices['日'][index],
            '英': language_voices['英'][index],
            '韩': language_voices['韩'][index],
        })
    return info_list
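
# get_voice_info_mys returns records in the same shape as get_voice_info
# above, so update_voice_data below can consume either source.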


# Download an audio file to a local path
async def download(url, path):
    res = await aiorequests.get(url=url, timeout=30)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(res.read())


async def update_voice_data():
    # fetch the full character list, then every character's voices
    char_list = await get_character_list()
    for char in char_list:
        info = await get_voice_info_mys(char)
        if not info:
            continue
        data = []
        for v in info:
            chn = ''
            jap = ''
            eng = ''
            kor = ''
            for language in config['voice_language']:
                url = v[language]
                if not url:
                    continue
                path = str(Path() / language / char / Path(url).name)
                out_path = OUT_PUT / path
                out = str(out_path)
                if not out_path.exists():
                    await download(url, out)
                if language == '中':
                    chn = path
                elif language == '日':
                    jap = path
                elif language == '英':
                    eng = path
                elif language == '韩':
                    kor = path
                logger.info('Downloaded: %s -> %s' % (char, path))
            data.append({
                'title': v['title'],
                'text': v['text'],
                'chn': chn,
                'jap': jap,
                'eng': eng,
                'kor': kor
            })
        # persist to the database
        db[char] = data
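
# Re-running update_voice_data is incremental: audio files already present
# under OUT_PUT are skipped by the out_path.exists() check above.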


async def voice_list_by_mys():
    url = 'https://api-static.mihoyo.com/common/blackboard/ys_obc/v1/home/content/list?app_sn=ys_obc&channel_id=84'
    resp = await aiorequests.get(url=url, timeout=30)
    json_data = resp.json()
    # a non-zero retcode signals an API-side error
    if json_data['retcode']:
        raise Exception(json_data['message'])
    try:
        data_list = json_data['data']['list'][0]['list']
    except KeyError as e:
        raise Exception('Failed to fetch the voice list, please contact the author for a fix') from e
    return {x['title'].split()[0]: x for x in data_list}


async def voice_detail_by_mys(content_id):
    url = 'https://bbs.mihoyo.com/ys/obc/content/%s/detail?bbs_presentation_style=no_header' % content_id
    res = await aiorequests.get(url=url, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    paragraph_box = soup.select('.obc-tmpl__paragraph-box')
    return [{
        'text': x.get_text(),
        'chn': x.find('source').attrs['src']
    } for x in paragraph_box]
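
# Usage sketch (not part of the original module; assumes the package import
# path matches the file location shown at the top):
#   import asyncio
#   from LittlePaimon.Guess_voice.download_data import update_voice_data
#   asyncio.run(update_voice_data())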