# NoneBot plugin: detects video links (bilibili / YouTube / douyin) in messages,
# downloads the media, and uploads it to S3, replying with a download link.
import time
|
||
import shutil
|
||
import requests
|
||
import json
|
||
import re
|
||
import sys
|
||
import os
|
||
import html
|
||
import tempfile
|
||
import asyncio
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
import hashlib
|
||
from nonebot import on_message, logger
|
||
from nonebot.rule import to_me
|
||
from nonebot.adapters import Event
|
||
|
||
from yt_dlp import YoutubeDL
|
||
from httpx import AsyncClient
|
||
import msgspec
|
||
|
||
from .minio import upload_to_s3
|
||
|
||
# Regex that extracts http/https URLs from an incoming message
URL_PATTERN = re.compile(r"(https?://[^\s]+)")
# Marker used to detect QQ mini-program card messages
# NOTE(review): the alternation mixes what look like full-width and escaped
# ASCII brackets — confirm the intended characters against real card payloads
pattern = r"QQ小程序(?:]|]|\])"

# Hosts we are willing to download from (original platforms plus douyin domains)
VALID_HOSTS = [
    "b23.tv",
    "bilibili.com",
    "youtube.com",
    "youtu.be",
    # douyin
    "douyin.com",
    "v.douyin.com",
    "iesdouyin.com",
    "m.douyin.com",
    "jingxuan.douyin.com",
]

# Douyin short-link detector (v.douyin.com/XXXX)
SHORT_LINK_PATTERN = re.compile(r"(v\.douyin\.com/[A-Za-z0-9_\-]+)")
# Extracts the router payload embedded in a douyin page's HTML
DOUYIN_ROUTER_PATTERN = re.compile(r"window\._ROUTER_DATA\s*=\s*(.*?)</script>", re.DOTALL)

# Low-priority, non-blocking handler that only fires when the bot is mentioned
video_handler = on_message(priority=10, block=False, rule=to_me())
|
||
|
||
|
||
@video_handler.handle()
async def handle_video_download(event: Event) -> None:
    """Entry point: inspect a message, resolve any supported video link,
    download the video and reply with an S3 download URL.

    Two paths:
      * Plain-text message: scan for URLs; douyin links get a dedicated
        parser (direct stream capture), everything else falls through to
        :func:`download_video` (yt-dlp / direct download).
      * QQ mini-program card (matched by ``pattern``): extract the embedded
        URL via :func:`proc_xcx`, then use the generic download path.
    """
    msg = str(event.get_message()).strip()
    logger.info(f"获取到的消息:{msg}")
    target_url: Optional[str] = None

    # Extract URL (plain-text branch: message is NOT a mini-program card)
    if not re.search(pattern, msg):
        urls = URL_PATTERN.findall(msg)
        for u in urls:
            logger.info(f"链接:{u}")

            # Douyin first: it needs its own parser to obtain direct stream
            # URLs, and on success the upload happens right here.
            if "douyin.com" in u or "v.douyin.com" in u or "iesdouyin.com" in u:
                await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
                try:
                    parsed_path = await parse_douyin(u)
                    if parsed_path:
                        # Douyin path downloads + merges locally; the result
                        # is uploaded to S3 straight away.
                        logger.info(f"文件路径:{parsed_path}")
                        await video_handler.send(f"视频:{parsed_path.name} 已缓存\n正在生成下载链接,请耐心等待!")
                        s3_url = upload_to_s3(parsed_path)
                        await video_handler.send(f"{s3_url}")
                        # NOTE: target_url stays None, so the generic download
                        # section below is skipped by the early return.
                        break
                    else:
                        await video_handler.send(f"抖音解析失败或未找到直链:{u}")
                        logger.warning(f"抖音解析失败或未找到直链:{u}")
                        # Parsing failed — try the remaining URLs.
                        continue
                except Exception as e:
                    await video_handler.send(f"解析抖音链接时出错:{e}")
                    logger.exception(f"解析抖音链接时出错:{e}")
                    continue

            # Non-douyin hosts use the generic yt-dlp/direct pipeline.
            if any(domain in u for domain in VALID_HOSTS):
                target_url = u
                break

        if not target_url:
            return  # nothing downloadable found (or douyin already handled)

        await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
    else:
        # Mini-program card branch: pull the real URL out of the card JSON.
        target_url = None
        logger.info("检测到小程序")
        url = await proc_xcx(msg)
        logger.info(f"链接:{url}")
        if url and any(domain in url for domain in VALID_HOSTS):
            target_url = url

        if not target_url:
            return  # unsupported or missing URL
        await video_handler.send("检测到视频,正在尝试下载视频,请稍候...")

    # Generic download + upload for the resolved URL.
    try:
        video_file = await download_video(target_url)
        if not video_file:
            await video_handler.send("视频下载失败。")
            return

        await video_handler.send(f"视频已缓存:{video_file.name}\n正在生成下载链接,请耐心等待!")
        logger.info(f"文件路径:{video_file}")
        s3_url = upload_to_s3(video_file)
        logger.info(f"文件上传完成,返回链接:{s3_url}")
        await video_handler.send(f"{s3_url}")
    except Exception as e:
        logger.exception(e)
        await video_handler.send("下载过程中出现错误。")
|
||
|
||
|
||
async def download_video(url: str) -> Optional[Path]:
    """Download the video behind *url* and return the local file Path.

    Strategy:
      1. If the URL looks like (or, per a HEAD probe, serves) a direct media
         file, stream it with httpx into a temp directory.
      2. Otherwise fall back to yt-dlp, trying Firefox browser cookies
         first, then a ``cookies.txt`` file next to this module.

    Returns:
        Path of the downloaded (and renamed) file, or None when yt-dlp
        produced no output file.

    Raises:
        Whatever yt-dlp raised when both cookie strategies fail.
    """

    # ---------- 1. Direct-link detection ----------
    # Cheap check: a media extension in the URL path.
    direct_media_ext = re.search(r"\.(mp4|m3u8|ts|webm|mov|flv)(?:$|\?)", url, re.IGNORECASE)
    is_direct = bool(direct_media_ext)

    if not is_direct:
        # More expensive check: HEAD request and inspect the content type.
        try:
            async with AsyncClient(follow_redirects=True, timeout=30) as client:
                head = await client.head(url, follow_redirects=True)
                ctype = head.headers.get("content-type", "")
                if ctype.startswith("video/") or "application/octet-stream" in ctype:
                    is_direct = True
        except Exception:
            # Probe failures are non-fatal; treat the URL as non-direct.
            is_direct = False

    if is_direct:
        temp_dir = tempfile.mkdtemp(prefix="direct_ytcache_")
        # Guess the extension from the URL, defaulting to mp4.
        ext = "mp4"
        m = re.search(r"\.([a-zA-Z0-9]{2,5})(?:$|\?)", url)
        if m and len(m.group(1)) <= 5:
            ext = m.group(1)

        filename = os.path.join(temp_dir, f"downloaded_video.{ext}")

        try:
            async with AsyncClient(follow_redirects=True, timeout=120) as client:
                async with client.stream("GET", url) as resp:
                    resp.raise_for_status()
                    with open(filename, "wb") as fh:
                        async for chunk in resp.aiter_bytes(chunk_size=8192):
                            fh.write(chunk)

            p = Path(filename)
            new_path = p.with_name(clean_filename(p.name))
            p.rename(new_path)
            logger.info(f"直接下载完成: {new_path}")
            return new_path

        except Exception:
            # Direct download failed: remove the partial file and fall
            # through to the yt-dlp path below.
            logger.exception("直接下载失败,回退 yt-dlp")
            try:
                if os.path.exists(filename):
                    os.remove(filename)
            except Exception:
                pass

    # ---------- 2. yt-dlp download ----------
    temp_dir = tempfile.mkdtemp(prefix="ytcache_")
    output_path = os.path.join(temp_dir, "%(title).80s.%(ext)s")

    base_opts = {
        "outtmpl": output_path,
        "format": "bestvideo+bestaudio/best",
        "noplaylist": True,
        "quiet": True,
        "ffmpeg_location": get_ffmpeg_path(),
        # The android player client avoids some YouTube throttling/login walls.
        "extractor_args": {
            "youtube": {
                "player_client": ["android"]
            }
        },
    }

    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine;
    # get_running_loop() is the correct call here.
    loop = asyncio.get_running_loop()

    def _run_yt(opts: dict) -> None:
        # yt-dlp is synchronous; run it in the default thread pool.
        with YoutubeDL(opts) as ydl:
            ydl.download([url])

    # ---------- 2.1 Preferred: Firefox cookies ----------
    try:
        opts = dict(base_opts)
        opts["cookiesfrombrowser"] = ("firefox",)
        logger.info("尝试使用 Firefox cookies 下载")
        await loop.run_in_executor(None, _run_yt, opts)

    except Exception as e:
        logger.warning(f"Firefox cookies 下载失败,尝试 cookies.txt:{e}")

        # ---------- 2.2 Fallback: cookies.txt ----------
        cookie_path = Path(__file__).resolve().parent / "cookies.txt"
        if cookie_path.exists():
            opts = dict(base_opts)
            opts["cookiefile"] = str(cookie_path)
            logger.info(f"使用 cookies 文件: {cookie_path}")
            await loop.run_in_executor(None, _run_yt, opts)
        else:
            logger.error("未找到 cookies.txt,无法通过登录验证")
            raise  # re-raise the original Firefox-cookies failure

    # ---------- 3. Result handling ----------
    files = list(Path(temp_dir).glob("*.*"))
    if not files:
        return None

    # NOTE(review): files[0] assumes yt-dlp left exactly one (merged) file
    # in the fresh temp dir — confirm with the chosen format string.
    original_file = files[0]
    new_path = original_file.with_name(clean_filename(original_file.name))
    original_file.rename(new_path)
    logger.info(f"下载完成并重命名: {new_path}")

    return new_path
|
||
|
||
|
||
def get_ffmpeg_path() -> str:
    """Locate ffmpeg: prefer an ffmpeg.exe bundled next to the Python
    interpreter, otherwise rely on an ``ffmpeg`` found via PATH."""
    candidate = os.path.join(os.path.dirname(sys.executable), "ffmpeg.exe")
    return candidate if os.path.exists(candidate) else "ffmpeg"
|
||
|
||
|
||
def clean_filename(filename: str) -> str:
    """Sanitize a file name by replacing or stripping characters that are
    unsafe in file systems / URLs (covers both ASCII and full-width
    CJK punctuation).
    """
    # Ordered (old, new) pairs — applied sequentially, so earlier rules
    # win when a later rule targets a character an earlier one produced.
    rules = (
        (" ", "_"), ("&", "_"), ("#", "_"),
        ("'", ""), ('"', ""), ("?", ""),
        (":", "_"), ("|", "_"), ("/", "_"), ("\\", "_"),
        ("*", "_"), ("<", "_"), (">", "_"),
        ("【", ""), ("】", ""), (":", ""), ("。", ""),
        (",", "_"), ("《", ""), ("》", ""),
        ("?", "_"), ("|", ""),
    )
    for old, new in rules:
        filename = filename.replace(old, new)
    return filename
|
||
|
||
|
||
async def proc_xcx(msg: str) -> Optional[str]:
    """Extract the shared-video URL from a QQ mini-program card payload.

    Looks for the ``qqdocurl`` field in the card's JSON-ish text, unescapes
    HTML entities and JSON slash escapes, and strips the query string.

    Returns:
        The cleaned base URL, or None when no ``qqdocurl`` field is present.
    """
    p_pattern = r'"qqdocurl":"(.*?)"'
    match = re.search(p_pattern, msg)

    if match:
        raw_url = match.group(1)
        unescaped = html.unescape(raw_url)
        # FIX: JSON escapes "/" as "\/". The old code only replaced the
        # doubled form "\\/" (r"\\/"), so standard "\/" sequences survived
        # and produced broken URLs. Handle both forms, doubled first.
        cleaned_url = unescaped.replace("\\\\/", "/").replace("\\/", "/")
        base_url = cleaned_url.split("?")[0]
        print("最终提取链接:", base_url)
        return base_url
    else:
        print("未找到 qqdocurl 字段")
        return None
|
||
|
||
|
||
# ----------------- Douyin parsing helpers -----------------
|
||
|
||
async def parse_douyin(url: str) -> Optional[Path]:
    """
    Resolve a douyin link to a downloaded local media file.

    Handles short links (v.douyin.com) as well as long / iesdouyin /
    m.douyin forms. Returns the Path of the fetched file (as produced by
    fetch_douyin_mp4), or None when nothing could be resolved.
    """
    # Short link: hand off to the Playwright-based fetcher using the
    # cookies.txt that lives next to this module.
    short = SHORT_LINK_PATTERN.search(url)
    if short:
        short_url = "https://" + short.group(1)
        try:
            cookies_path = os.path.dirname(__file__).replace("\\", "/") + "/cookies.txt"
            file_path = await fetch_douyin_mp4(short_url, cookies_path, 10)
            return file_path
        except Exception:
            logger.exception("获取直链失败")
            # Fall through and still try the original url below.

    # Try to scrape router data from the page.
    # NOTE(review): this branch currently only logs the URL — the actual
    # page-scraping implementation (DOUYIN_ROUTER_PATTERN) appears
    # unfinished, so non-short links always end up returning None.
    try:
        logger.info(f"获取到的信息:{url}")
    except Exception:
        logger.exception("抖音页面解析失败")
        return None
|
||
|
||
|
||
import asyncio
|
||
import tempfile
|
||
import aiohttp
|
||
import os
|
||
from typing import List, Dict, Optional
|
||
|
||
from playwright.async_api import async_playwright
|
||
import ffmpeg
|
||
|
||
|
||
class DouyinFetchError(Exception):
    """Raised when a douyin video/audio stream cannot be resolved or downloaded."""
|
||
|
||
|
||
def parse_netscape_cookies(cookies_txt_path: str) -> List[Dict]:
    """Convert a Netscape-format cookies.txt into Playwright cookie dicts.

    Blank lines, comment lines, and rows without exactly seven
    tab-separated fields are skipped. A positive numeric expiry is carried
    over as ``expires``; session cookies get no expiry key.
    """
    result: List[Dict] = []

    with open(cookies_txt_path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue

            fields = entry.split("\t")
            if len(fields) != 7:
                continue

            # Netscape columns: domain, includeSubdomains, path, secure,
            # expiry, name, value (includeSubdomains is unused here).
            domain, _include_sub, path, secure, expiry, name, value = fields

            record = {
                "name": name,
                "value": value,
                "domain": domain,
                "path": path,
                "secure": secure.upper() == "TRUE",
                "httpOnly": False,
            }

            if expiry.isdigit() and int(expiry) > 0:
                record["expires"] = int(expiry)

            result.append(record)

    return result
|
||
|
||
|
||
async def download_url(url: str, filepath: str, headers: Optional[Dict] = None):
    """Stream *url* into *filepath* in 64 KiB chunks.

    Raises DouyinFetchError when the server responds with anything other
    than HTTP 200. Uses a 300-second total timeout for the whole transfer.
    """
    request_headers = headers if headers is not None else {}
    timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, headers=request_headers) as resp:
            if resp.status != 200:
                raise DouyinFetchError(f"下载失败 {resp.status}: {url}")

            with open(filepath, "wb") as out:
                async for chunk in resp.content.iter_chunked(1024 * 64):
                    out.write(chunk)
|
||
|
||
|
||
async def fetch_douyin_mp4(
    douyin_url: str,
    cookies_txt: str,
    wait_seconds: int = 15,
) -> Path:
    """Resolve a douyin page URL to a local MP4 file.

    Drives a real Chrome via Playwright (authenticated with cookies from
    *cookies_txt*), sniffs the separate video/audio stream URLs from
    network responses, downloads them, and muxes them with ffmpeg when a
    distinct audio track exists.

    Returns:
        Path of the merged (or directly copied) MP4 temp file.

    Raises:
        DouyinFetchError: when the cookies file is unusable or no video
        stream was observed within *wait_seconds*.
    """

    cookies = parse_netscape_cookies(cookies_txt)
    if not cookies:
        raise DouyinFetchError("cookies.txt 解析失败或为空")

    video_url: Optional[str] = None
    audio_url: Optional[str] = None

    async with async_playwright() as p:
        # NOTE(review): hard-coded Windows Chrome path and headless=False
        # make this host-specific — consider making both configurable.
        browser = await p.chromium.launch(
            executable_path="C:/Program Files/Google/Chrome/Application/chrome.exe",
            headless=False,
            args=[
                "--autoplay-policy=no-user-gesture-required",
                "--disable-features=AutoplayDisableSuppression",
                "--use-fake-ui-for-media-stream",
            ]
        )

        context = await browser.new_context(
            viewport={"width": 640, "height": 320},
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
        )

        await context.add_cookies(cookies)
        page = await context.new_page()

        def handle_response(response):
            # Keep the first video and first audio segment URL we observe.
            nonlocal video_url, audio_url
            url = response.url
            if video_url is None and ("video" in url and "mime_type=video_mp4" in url):
                video_url = url
            elif audio_url is None and ("audio" in url and "mime_type=audio" in url):
                audio_url = url

        page.on("response", handle_response)

        await page.goto(douyin_url, wait_until="domcontentloaded")
        await page.wait_for_timeout(wait_seconds * 1000)
        await browser.close()

    if not video_url:
        raise DouyinFetchError("未捕获视频流 URL")

    # -----------------------------
    # Download to temp files
    # -----------------------------
    tmp_dir = Path(tempfile.gettempdir())
    timestamp = int(time.time() * 1000)

    # FIX: the douyin/ subdirectory was never created, so opening the
    # download target failed on a clean machine.
    (tmp_dir / "douyin").mkdir(parents=True, exist_ok=True)
    tmp_video = tmp_dir / f"douyin/{timestamp}_video.mp4"
    tmp_out = tmp_dir / f"douyin/{timestamp}_out.mp4"

    # Download the video track
    await download_url(video_url, tmp_video)

    if audio_url:
        tmp_audio = tmp_dir / f"{timestamp}_audio.m4a"
        try:
            await download_url(audio_url, tmp_audio)
            # FIX: the old code called .output(tmp_audio, tmp_out, ...) on
            # the video stream, passing the audio *path* where ffmpeg-python
            # expects input streams — an invalid invocation. Open both
            # tracks as inputs and mux them into one output.
            video_in = ffmpeg.input(str(tmp_video))
            audio_in = ffmpeg.input(str(tmp_audio))
            (
                ffmpeg
                .output(video_in, audio_in, str(tmp_out),
                        vcodec="copy", acodec="aac", movflags="faststart")
                .overwrite_output()
                .run(quiet=True)
            )
        finally:
            tmp_video.unlink(missing_ok=True)
            tmp_audio.unlink(missing_ok=True)
    else:
        # Single-track video already contains audio: just copy it out.
        shutil.copy(tmp_video, tmp_out)
        tmp_video.unlink(missing_ok=True)

    return tmp_out
|