463 lines
15 KiB
Python
463 lines
15 KiB
Python
|
|
import time
|
|||
|
|
import shutil
|
|||
|
|
import requests
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
import html
|
|||
|
|
import tempfile
|
|||
|
|
import asyncio
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Optional
|
|||
|
|
import hashlib
|
|||
|
|
from nonebot import on_message, logger
|
|||
|
|
from nonebot.rule import to_me
|
|||
|
|
from nonebot.adapters import Event
|
|||
|
|
|
|||
|
|
from yt_dlp import YoutubeDL
|
|||
|
|
from httpx import AsyncClient
|
|||
|
|
import msgspec
|
|||
|
|
|
|||
|
|
from .minio import upload_to_s3
|
|||
|
|
|
|||
|
|
# Regex that matches http(s) URLs embedded in free text.
URL_PATTERN = re.compile(r"(https?://[^\s]+)")

# Marker used to detect a forwarded QQ mini-program card in the raw message text.
pattern = r"QQ小程序(?:]|]|\])"

# Hosts accepted by the generic (yt-dlp) downloader; Douyin domains are
# listed too so its links pass the host filter, but they are resolved by
# the dedicated Douyin pipeline first.
VALID_HOSTS = [
    "b23.tv",
    "bilibili.com",
    "youtube.com",
    "youtu.be",
    # douyin
    "douyin.com",
    "v.douyin.com",
    "iesdouyin.com",
    "m.douyin.com",
    "jingxuan.douyin.com",
]

# Douyin short-link detector (v.douyin.com/<code>).
SHORT_LINK_PATTERN = re.compile(r"(v\.douyin\.com/[A-Za-z0-9_\-]+)")
# Extracts the server-rendered router payload from a Douyin HTML page.
DOUYIN_ROUTER_PATTERN = re.compile(r"window\._ROUTER_DATA\s*=\s*(.*?)</script>", re.DOTALL)

# Fires on any message addressed to the bot; block=False lets lower-priority
# handlers still see the event.
video_handler = on_message(priority=10, block=False, rule=to_me())
|
|||
|
|
|
|||
|
|
|
|||
|
|
@video_handler.handle()
async def handle_video_download(event: Event):
    """Inspect an incoming message, extract a supported video link (plain
    URL or QQ mini-program card), download the video and reply with an S3
    download link.

    Douyin links are fully handled inline by ``parse_douyin`` (download +
    upload + reply); every other supported host falls through to the
    generic ``download_video`` pipeline at the bottom.
    """
    msg = str(event.get_message()).strip()
    logger.info(f"获取到的消息:{msg}")
    target_url: Optional[str] = None

    # Plain-text message (no mini-program marker): scan it for URLs.
    if not re.search(pattern, msg):
        urls = URL_PATTERN.findall(msg)
        for u in urls:
            logger.info(f"链接:{u}")

            # Douyin first: its resolver returns an already-merged local
            # file, so upload happens right here instead of below.
            if "douyin.com" in u or "v.douyin.com" in u or "iesdouyin.com" in u:
                await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
                try:
                    parsed_path = await parse_douyin(u)
                    if parsed_path:
                        # parse_douyin produced a local file; push it to S3
                        # and reply with the resulting link.
                        logger.info(f"文件路径:{parsed_path}")
                        await video_handler.send(f"视频:{parsed_path.name} 已缓存\n正在生成下载链接,请耐心等待!")
                        s3_url = upload_to_s3(parsed_path)
                        await video_handler.send(f"{s3_url}")
                        # Done — target_url stays None so the code below
                        # returns without re-downloading.
                        break
                    else:
                        await video_handler.send(f"抖音解析失败或未找到直链:{u}")
                        logger.warning(f"抖音解析失败或未找到直链:{u}")
                        # Resolution failed — try any remaining links.
                        continue
                except Exception as e:
                    await video_handler.send(f"解析抖音链接时出错:{e}")
                    logger.exception(f"解析抖音链接时出错:{e}")
                    continue

            # Non-Douyin hosts go through the generic pipeline below.
            if any(domain in u for domain in VALID_HOSTS):
                target_url = u
                break

        if not target_url:
            return  # nothing usable, or Douyin already handled above
        await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
    else:
        # Mini-program card: pull the real URL out of the embedded JSON.
        target_url = None
        logger.info("检测到小程序")
        url = await proc_xcx(msg)
        logger.info(f"链接:{url}")
        if url and any(domain in url for domain in VALID_HOSTS):
            target_url = url

        if not target_url:
            return  # unsupported or invalid link
        await video_handler.send("检测到视频,正在尝试下载视频,请稍候...")

    # Generic path: download with yt-dlp/httpx, then upload to S3.
    try:
        video_file = await download_video(target_url)
        if not video_file:
            await video_handler.send("视频下载失败。")
            return

        await video_handler.send(f"视频已缓存:{video_file.name}\n正在生成下载链接,请耐心等待!")
        logger.info(f"文件路径:{video_file}")
        s3_url = upload_to_s3(video_file)
        logger.info(f"文件上传完成,返回链接:{s3_url}")
        await video_handler.send(f"{s3_url}")
    except Exception as e:
        logger.exception(e)
        await video_handler.send("下载过程中出现错误。")
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def download_video(url: str) -> Optional[Path]:
    """Download *url* and return the path of the resulting local file.

    Strategy:
      1. If the URL looks like (or serves) a direct media file, stream it
         with httpx.
      2. Otherwise fall back to yt-dlp, first with Firefox browser cookies,
         then with a local ``cookies.txt`` next to this module.

    Returns:
        Path of the downloaded (renamed/sanitized) file, or None when
        yt-dlp produced no output file.
    """

    # ---------- 1. Direct-link probe ----------
    # Cheap check by extension first; if that fails, issue a HEAD request
    # and inspect the content type.
    direct_media_ext = re.search(r"\.(mp4|m3u8|ts|webm|mov|flv)(?:$|\?)", url, re.IGNORECASE)
    is_direct = bool(direct_media_ext)

    if not is_direct:
        try:
            async with AsyncClient(follow_redirects=True, timeout=30) as client:
                head = await client.head(url, follow_redirects=True)
                ctype = head.headers.get("content-type", "")
                if ctype.startswith("video/") or "application/octet-stream" in ctype:
                    is_direct = True
        except Exception:
            # HEAD failed (blocked, timeout, ...): treat as non-direct and
            # let yt-dlp handle it.
            is_direct = False

    if is_direct:
        temp_dir = tempfile.mkdtemp(prefix="direct_ytcache_")
        # Default to .mp4; keep the URL's own extension when it is sane.
        ext = "mp4"
        m = re.search(r"\.([a-zA-Z0-9]{2,5})(?:$|\?)", url)
        if m and len(m.group(1)) <= 5:
            ext = m.group(1)

        filename = os.path.join(temp_dir, f"downloaded_video.{ext}")

        try:
            # Stream in 8 KiB chunks to keep memory bounded.
            async with AsyncClient(follow_redirects=True, timeout=120) as client:
                async with client.stream("GET", url) as resp:
                    resp.raise_for_status()
                    with open(filename, "wb") as fh:
                        async for chunk in resp.aiter_bytes(chunk_size=8192):
                            fh.write(chunk)

            # Sanitize the file name before handing it to the uploader.
            p = Path(filename)
            new_path = p.with_name(clean_filename(p.name))
            p.rename(new_path)
            logger.info(f"直接下载完成: {new_path}")
            return new_path

        except Exception:
            # Direct download failed: clean up the partial file and fall
            # through to the yt-dlp path below.
            logger.exception("直接下载失败,回退 yt-dlp")
            try:
                if os.path.exists(filename):
                    os.remove(filename)
            except Exception:
                pass

    # ---------- 2. yt-dlp download ----------
    temp_dir = tempfile.mkdtemp(prefix="ytcache_")
    # Cap the title at 80 chars so the path stays within OS limits.
    output_path = os.path.join(temp_dir, "%(title).80s.%(ext)s")

    base_opts = {
        "outtmpl": output_path,
        "format": "bestvideo+bestaudio/best",
        "noplaylist": True,
        "quiet": True,
        "ffmpeg_location": get_ffmpeg_path(),
        "extractor_args": {
            "youtube": {
                # Android client tends to dodge YouTube throttling/consent.
                "player_client": ["android"]
            }
        },
    }

    # NOTE(review): asyncio.get_event_loop() is deprecated inside a running
    # coroutine — get_running_loop() would be the modern call; verify the
    # target Python version before changing.
    loop = asyncio.get_event_loop()

    def _run_yt(opts: dict):
        # yt-dlp is blocking; runs in the default executor below.
        with YoutubeDL(opts) as ydl:
            ydl.download([url])

    # ---------- 2.1 Preferred: Firefox browser cookies ----------
    try:
        opts = dict(base_opts)
        opts["cookiesfrombrowser"] = ("firefox",)
        logger.info("尝试使用 Firefox cookies 下载")
        await loop.run_in_executor(None, _run_yt, opts)

    except Exception as e:
        logger.warning(f"Firefox cookies 下载失败,尝试 cookies.txt:{e}")

        # ---------- 2.2 Fallback: cookies.txt next to this module ----------
        cookie_path = Path(__file__).resolve().parent / "cookies.txt"
        if cookie_path.exists():
            opts = dict(base_opts)
            opts["cookiefile"] = str(cookie_path)
            logger.info(f"使用 cookies 文件: {cookie_path}")
            await loop.run_in_executor(None, _run_yt, opts)
        else:
            logger.error("未找到 cookies.txt,无法通过登录验证")
            # Re-raise the original Firefox-cookies failure.
            raise

    # ---------- 3. Result handling ----------
    files = list(Path(temp_dir).glob("*.*"))
    if not files:
        return None

    # NOTE(review): with noplaylist=True there should be exactly one output
    # file; files[0] is otherwise an arbitrary pick — confirm.
    original_file = files[0]
    new_path = original_file.with_name(clean_filename(original_file.name))
    original_file.rename(new_path)
    logger.info(f"下载完成并重命名: {new_path}")

    return new_path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_ffmpeg_path() -> str:
    """Locate an ffmpeg executable for yt-dlp.

    Order of preference:
      1. ``ffmpeg.exe`` next to the Python interpreter (Windows venv layout,
         the original behavior);
      2. anything named ``ffmpeg`` on PATH (new — makes non-Windows hosts work);
      3. the bare command name ``"ffmpeg"`` so ffmpeg's own error surfaces
         downstream if it is truly absent.

    Returns:
        A path or command name suitable for yt-dlp's ``ffmpeg_location``.
    """
    scripts_dir = os.path.dirname(sys.executable)
    ffmpeg_path = os.path.join(scripts_dir, "ffmpeg.exe")
    if os.path.exists(ffmpeg_path):
        return ffmpeg_path
    # The Windows-only probe above never matches on Linux/macOS; consult
    # PATH before giving up.
    which_hit = shutil.which("ffmpeg")
    if which_hit:
        return which_hit
    return "ffmpeg"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_filename(filename: str) -> str:
    """Sanitize a file name for file systems and URLs.

    Replaces separators/ampersands/etc. with underscores and strips quote
    characters and a handful of CJK punctuation marks. The substitutions
    are applied sequentially in a fixed order, exactly mirroring the
    previous chained-``replace`` behavior.
    """
    substitutions = (
        (" ", "_"),
        ("&", "_"),
        ("#", "_"),
        ("'", ""),
        ('"', ""),
        ("?", ""),
        (":", "_"),
        ("|", "_"),
        ("/", "_"),
        ("\\", "_"),
        ("*", "_"),
        ("<", "_"),
        (">", "_"),
        ("【", ""),
        ("】", ""),
        (":", ""),
        ("。", ""),
        (",", "_"),
        ("《", ""),
        ("》", ""),
        ("?", "_"),
        ("|", ""),
    )
    for old, new in substitutions:
        filename = filename.replace(old, new)
    return filename
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def proc_xcx(msg):
    """Extract the shared-card URL (``qqdocurl``) from a QQ mini-program message.

    Returns the unescaped URL with its query string removed, or ``None``
    when the field is absent.
    """
    hit = re.search(r'"qqdocurl":"(.*?)"', msg)
    if not hit:
        print("未找到 qqdocurl 字段")
        return None

    # Undo HTML entities, then the JSON-style escaped slashes, and keep
    # only what precedes the first "?".
    candidate = html.unescape(hit.group(1)).replace(r"\\/", "/")
    base_url, _, _ = candidate.partition("?")
    print("最终提取链接:", base_url)
    return base_url
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ----------------- Douyin parsing helpers -----------------
|
|||
|
|
|
|||
|
|
async def parse_douyin(url: str) -> Optional[Path]:
    """Resolve a Douyin link and return the path of the downloaded (and
    merged) local video file, or ``None`` on failure.

    Supports short links (v.douyin.com); long links currently only get
    logged — the router-data scraping branch is a stub.

    Note: the return annotation was ``Optional[str]`` before, but
    ``fetch_douyin_mp4`` returns a ``Path`` and the caller uses
    ``parsed_path.name``, so ``Optional[Path]`` is the accurate type.
    """
    # Short links are resolved through a browser-based capture that
    # records the real media stream URLs and merges them with ffmpeg.
    short = SHORT_LINK_PATTERN.search(url)
    if short:
        short_url = "https://" + short.group(1)
        try:
            # cookies.txt lives next to this module. Path handles
            # separators portably (the previous string concatenation was
            # Windows-specific); matches download_video's cookie lookup.
            cookies_path = str(Path(__file__).resolve().parent / "cookies.txt")
            file_path = await fetch_douyin_mp4(short_url, cookies_path, 10)
            return file_path
        except Exception:
            logger.exception("获取直链失败")
            # Fall through and still try the original URL below.

    # Long links: router-data scraping is not implemented yet; just log
    # and return None implicitly.
    try:
        logger.info(f"获取到的信息:{url}")
    except Exception:
        logger.exception("抖音页面解析失败")
        return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
import asyncio
|
|||
|
|
import tempfile
|
|||
|
|
import aiohttp
|
|||
|
|
import os
|
|||
|
|
from typing import List, Dict, Optional
|
|||
|
|
|
|||
|
|
from playwright.async_api import async_playwright
|
|||
|
|
import ffmpeg
|
|||
|
|
|
|||
|
|
|
|||
|
|
class DouyinFetchError(Exception):
    """Raised when fetching a Douyin video fails: empty/unparsable
    cookies, no captured video stream, or a non-200 download response."""
    pass
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_netscape_cookies(cookies_txt_path: str) -> List[Dict]:
    """Parse a Netscape HTTP Cookie File into Playwright-style cookie dicts.

    Fix over the previous version: browsers and curl prefix http-only
    cookies with ``#HttpOnly_`` on the domain field; those lines were
    being discarded as comments, silently dropping exactly the session
    cookies a login-gated site needs. They are now parsed and flagged
    ``httpOnly: True``.

    Args:
        cookies_txt_path: path to a cookies.txt in Netscape format.

    Returns:
        List of dicts with keys name/value/domain/path/secure/httpOnly
        (plus ``expires`` when the file carries a positive expiry).
        Blank, comment, and malformed lines are skipped.
    """
    cookies: List[Dict] = []

    with open(cookies_txt_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # "#HttpOnly_" marks a real (http-only) cookie, not a comment.
            http_only = False
            if line.startswith("#HttpOnly_"):
                http_only = True
                line = line[len("#HttpOnly_"):]
            elif line.startswith("#"):
                continue

            parts = line.split("\t")
            if len(parts) != 7:
                continue

            domain, include_sub, path, secure, expiry, name, value = parts

            cookie = {
                "name": name,
                "value": value,
                "domain": domain,
                "path": path,
                "secure": secure.upper() == "TRUE",
                "httpOnly": http_only,
            }

            # Session cookies (expiry 0 or non-numeric) carry no expires key.
            if expiry.isdigit() and int(expiry) > 0:
                cookie["expires"] = int(expiry)

            cookies.append(cookie)

    return cookies
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def download_url(url: str, filepath: str, headers: Optional[Dict] = None):
    """Stream *url* to *filepath* with aiohttp.

    Args:
        url: resource to fetch.
        filepath: destination path (str or Path — open() accepts both).
        headers: optional extra request headers.

    Raises:
        DouyinFetchError: when the response status is not exactly 200.
    """
    headers = headers or {}
    # Generous 5-minute total timeout: media files can be large.
    timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, headers=headers) as resp:
            if resp.status != 200:
                raise DouyinFetchError(f"下载失败 {resp.status}: {url}")

            # 64 KiB chunks keep memory bounded for large downloads.
            with open(filepath, "wb") as f:
                async for chunk in resp.content.iter_chunked(1024 * 64):
                    f.write(chunk)
|
|||
|
|
|
|||
|
|
|
|||
|
|
async def fetch_douyin_mp4(
    douyin_url: str,
    cookies_txt: str,
    wait_seconds: int = 15,
) -> Path:
    """Open a Douyin URL in a real Chrome instance with the user's cookies,
    sniff the video/audio stream URLs from network responses, download the
    tracks and merge them into a single MP4.

    Args:
        douyin_url: page URL to open.
        cookies_txt: path to a Netscape-format cookies.txt file.
        wait_seconds: how long to let the page run so media requests fire.

    Returns:
        Path of the merged MP4 temp file (caller owns cleanup).

    Raises:
        DouyinFetchError: when cookies are empty/unparsable or no video
            stream URL was captured.

    Fixes over the previous version:
      * temp files were written into a ``<tmp>/douyin/`` directory that was
        never created, so every ``open()`` failed — the directory is now
        created on demand;
      * the ffmpeg merge passed the audio *path* into ``.output()`` where
        ffmpeg-python expects input streams — both tracks are now proper
        ``ffmpeg.input()`` streams;
      * the return annotation said ``str`` but a ``Path`` is returned.
    """
    cookies = parse_netscape_cookies(cookies_txt)
    if not cookies:
        raise DouyinFetchError("cookies.txt 解析失败或为空")

    video_url: Optional[str] = None
    audio_url: Optional[str] = None

    async with async_playwright() as p:
        # NOTE(review): hard-coded Windows Chrome path and headless=False —
        # this only runs on a desktop Windows host; consider making both
        # configurable.
        browser = await p.chromium.launch(
            executable_path="C:/Program Files/Google/Chrome/Application/chrome.exe",
            headless=False,
            args=[
                "--autoplay-policy=no-user-gesture-required",
                "--disable-features=AutoplayDisableSuppression",
                "--use-fake-ui-for-media-stream",
            ]
        )

        context = await browser.new_context(
            viewport={"width": 640, "height": 320},
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
        )

        await context.add_cookies(cookies)
        page = await context.new_page()

        def handle_response(response):
            # Keep only the first matching split video/audio stream URL each.
            nonlocal video_url, audio_url
            url = response.url
            if video_url is None and ("video" in url and "mime_type=video_mp4" in url):
                video_url = url
            elif audio_url is None and ("audio" in url and "mime_type=audio" in url):
                audio_url = url

        page.on("response", handle_response)

        await page.goto(douyin_url, wait_until="domcontentloaded")
        # Let the player start so the media requests are observed.
        await page.wait_for_timeout(wait_seconds * 1000)
        await browser.close()

    if not video_url:
        raise DouyinFetchError("未捕获视频流 URL")

    # -----------------------------
    # Download to temp files under <tmp>/douyin (created on demand).
    # -----------------------------
    tmp_dir = Path(tempfile.gettempdir()) / "douyin"
    tmp_dir.mkdir(parents=True, exist_ok=True)
    timestamp = int(time.time() * 1000)

    tmp_video = tmp_dir / f"{timestamp}_video.mp4"
    tmp_out = tmp_dir / f"{timestamp}_out.mp4"

    await download_url(video_url, tmp_video)

    if audio_url:
        tmp_audio = tmp_dir / f"{timestamp}_audio.m4a"
        try:
            await download_url(audio_url, tmp_audio)
            # Mux the separate tracks: video copied as-is, audio re-encoded
            # to AAC, faststart for web playback.
            video_in = ffmpeg.input(str(tmp_video))
            audio_in = ffmpeg.input(str(tmp_audio))
            (
                ffmpeg
                .output(video_in, audio_in, str(tmp_out), vcodec="copy", acodec="aac", movflags="faststart")
                .overwrite_output()
                .run(quiet=True)
            )
        finally:
            tmp_video.unlink(missing_ok=True)
            tmp_audio.unlink(missing_ok=True)
    else:
        # Single muxed stream: just move it to the output name.
        shutil.copy(tmp_video, tmp_out)
        tmp_video.unlink(missing_ok=True)

    return tmp_out
|