# NoneBot plugin: detects video links (bilibili / YouTube / douyin) in messages,
# downloads the media, and uploads it to S3, replying with a download link.
import time
|
||
import shutil
|
||
import requests
|
||
import json
|
||
import re
|
||
import sys
|
||
import os
|
||
import html
|
||
import tempfile
|
||
import asyncio
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
import hashlib
|
||
from nonebot import on_message, logger
|
||
from nonebot.rule import to_me
|
||
from nonebot.adapters import Event
|
||
|
||
from yt_dlp import YoutubeDL
|
||
from httpx import AsyncClient
|
||
import msgspec
|
||
|
||
from .minio import upload_to_s3
|
||
|
||
# Regex that extracts http/https URLs from an incoming message
URL_PATTERN = re.compile(r"(https?://[^\s]+)")
# Marker used to detect QQ mini-program card messages
# NOTE(review): the alternation mixes what look like full-width and escaped
# ASCII brackets — confirm the intended characters against real card payloads
pattern = r"QQ小程序(?:]|]|\])"

# Hosts we are willing to download from (original platforms plus douyin domains)
VALID_HOSTS = [
    "b23.tv",
    "bilibili.com",
    "youtube.com",
    "youtu.be",
    # douyin
    "douyin.com",
    "v.douyin.com",
    "iesdouyin.com",
    "m.douyin.com",
    "jingxuan.douyin.com",
]

# Douyin short-link detector (v.douyin.com/XXXX)
SHORT_LINK_PATTERN = re.compile(r"(v\.douyin\.com/[A-Za-z0-9_\-]+)")
# Extracts the router payload embedded in a douyin page's HTML
DOUYIN_ROUTER_PATTERN = re.compile(r"window\._ROUTER_DATA\s*=\s*(.*?)</script>", re.DOTALL)

# Low-priority, non-blocking handler that only fires when the bot is mentioned
video_handler = on_message(priority=10, block=False, rule=to_me())
|
||
|
||
|
||
@video_handler.handle()
async def handle_video_download(event: Event) -> None:
    """Entry point: inspect a message, resolve any supported video link,
    download the video and reply with an S3 download URL.

    Two paths:
      * Plain-text message: scan for URLs; douyin links get a dedicated
        parser (direct stream capture), everything else falls through to
        :func:`download_video` (yt-dlp / direct download).
      * QQ mini-program card (matched by ``pattern``): extract the embedded
        URL via :func:`proc_xcx`, then use the generic download path.
    """
    msg = str(event.get_message()).strip()
    logger.info(f"获取到的消息:{msg}")
    target_url: Optional[str] = None

    # Extract URL (plain-text branch: message is NOT a mini-program card)
    if not re.search(pattern, msg):
        urls = URL_PATTERN.findall(msg)
        for u in urls:
            logger.info(f"链接:{u}")

            # Douyin first: it needs its own parser to obtain direct stream
            # URLs, and on success the upload happens right here.
            if "douyin.com" in u or "v.douyin.com" in u or "iesdouyin.com" in u:
                await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
                try:
                    parsed_path = await parse_douyin(u)
                    if parsed_path:
                        # Douyin path downloads + merges locally; the result
                        # is uploaded to S3 straight away.
                        logger.info(f"文件路径:{parsed_path}")
                        await video_handler.send(f"视频:{parsed_path.name} 已缓存\n正在生成下载链接,请耐心等待!")
                        s3_url = upload_to_s3(parsed_path)
                        await video_handler.send(f"{s3_url}")
                        # NOTE: target_url stays None, so the generic download
                        # section below is skipped by the early return.
                        break
                    else:
                        await video_handler.send(f"抖音解析失败或未找到直链:{u}")
                        logger.warning(f"抖音解析失败或未找到直链:{u}")
                        # Parsing failed — try the remaining URLs.
                        continue
                except Exception as e:
                    await video_handler.send(f"解析抖音链接时出错:{e}")
                    logger.exception(f"解析抖音链接时出错:{e}")
                    continue

            # Non-douyin hosts use the generic yt-dlp/direct pipeline.
            if any(domain in u for domain in VALID_HOSTS):
                target_url = u
                break

        if not target_url:
            return  # nothing downloadable found (or douyin already handled)

        await video_handler.send("检测到链接,正在尝试下载视频,请稍候...")
    else:
        # Mini-program card branch: pull the real URL out of the card JSON.
        target_url = None
        logger.info("检测到小程序")
        url = await proc_xcx(msg)
        logger.info(f"链接:{url}")
        if url and any(domain in url for domain in VALID_HOSTS):
            target_url = url

        if not target_url:
            return  # unsupported or missing URL
        await video_handler.send("检测到视频,正在尝试下载视频,请稍候...")

    # Generic download + upload for the resolved URL.
    try:
        video_file = await download_video(target_url)
        if not video_file:
            await video_handler.send("视频下载失败。")
            return

        await video_handler.send(f"视频已缓存:{video_file.name}\n正在生成下载链接,请耐心等待!")
        logger.info(f"文件路径:{video_file}")
        s3_url = upload_to_s3(video_file)
        logger.info(f"文件上传完成,返回链接:{s3_url}")
        await video_handler.send(f"{s3_url}")
    except Exception as e:
        logger.exception(e)
        await video_handler.send("下载过程中出现错误。")
|
||
|
||
|
||
async def download_video(url: str) -> Optional[Path]:
    """Download the video behind *url* and return the local file Path.

    Strategy:
      1. If the URL looks like (or, per a HEAD probe, serves) a direct media
         file, stream it with httpx into a temp directory.
      2. Otherwise fall back to yt-dlp, trying Firefox browser cookies
         first, then a ``cookies.txt`` file next to this module.

    Returns:
        Path of the downloaded (and renamed) file, or None when yt-dlp
        produced no output file.

    Raises:
        Whatever yt-dlp raised when both cookie strategies fail.
    """

    # ---------- 1. Direct-link detection ----------
    # Cheap check: a media extension in the URL path.
    direct_media_ext = re.search(r"\.(mp4|m3u8|ts|webm|mov|flv)(?:$|\?)", url, re.IGNORECASE)
    is_direct = bool(direct_media_ext)

    if not is_direct:
        # More expensive check: HEAD request and inspect the content type.
        try:
            async with AsyncClient(follow_redirects=True, timeout=30) as client:
                head = await client.head(url, follow_redirects=True)
                ctype = head.headers.get("content-type", "")
                if ctype.startswith("video/") or "application/octet-stream" in ctype:
                    is_direct = True
        except Exception:
            # Probe failures are non-fatal; treat the URL as non-direct.
            is_direct = False

    if is_direct:
        temp_dir = tempfile.mkdtemp(prefix="direct_ytcache_")
        # Guess the extension from the URL, defaulting to mp4.
        ext = "mp4"
        m = re.search(r"\.([a-zA-Z0-9]{2,5})(?:$|\?)", url)
        if m and len(m.group(1)) <= 5:
            ext = m.group(1)

        filename = os.path.join(temp_dir, f"downloaded_video.{ext}")

        try:
            async with AsyncClient(follow_redirects=True, timeout=120) as client:
                async with client.stream("GET", url) as resp:
                    resp.raise_for_status()
                    with open(filename, "wb") as fh:
                        async for chunk in resp.aiter_bytes(chunk_size=8192):
                            fh.write(chunk)

            p = Path(filename)
            new_path = p.with_name(clean_filename(p.name))
            p.rename(new_path)
            logger.info(f"直接下载完成: {new_path}")
            return new_path

        except Exception:
            # Direct download failed: remove the partial file and fall
            # through to the yt-dlp path below.
            logger.exception("直接下载失败,回退 yt-dlp")
            try:
                if os.path.exists(filename):
                    os.remove(filename)
            except Exception:
                pass

    # ---------- 2. yt-dlp download ----------
    temp_dir = tempfile.mkdtemp(prefix="ytcache_")
    output_path = os.path.join(temp_dir, "%(title).80s.%(ext)s")

    base_opts = {
        "outtmpl": output_path,
        "format": "bestvideo+bestaudio/best",
        "noplaylist": True,
        "quiet": True,
        "ffmpeg_location": get_ffmpeg_path(),
        # The android player client avoids some YouTube throttling/login walls.
        "extractor_args": {
            "youtube": {
                "player_client": ["android"]
            }
        },
    }

    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine;
    # get_running_loop() is the correct call here.
    loop = asyncio.get_running_loop()

    def _run_yt(opts: dict) -> None:
        # yt-dlp is synchronous; run it in the default thread pool.
        with YoutubeDL(opts) as ydl:
            ydl.download([url])

    # ---------- 2.1 Preferred: Firefox cookies ----------
    try:
        opts = dict(base_opts)
        opts["cookiesfrombrowser"] = ("firefox",)
        logger.info("尝试使用 Firefox cookies 下载")
        await loop.run_in_executor(None, _run_yt, opts)

    except Exception as e:
        logger.warning(f"Firefox cookies 下载失败,尝试 cookies.txt:{e}")

        # ---------- 2.2 Fallback: cookies.txt ----------
        cookie_path = Path(__file__).resolve().parent / "cookies.txt"
        if cookie_path.exists():
            opts = dict(base_opts)
            opts["cookiefile"] = str(cookie_path)
            logger.info(f"使用 cookies 文件: {cookie_path}")
            await loop.run_in_executor(None, _run_yt, opts)
        else:
            logger.error("未找到 cookies.txt,无法通过登录验证")
            raise  # re-raise the original Firefox-cookies failure

    # ---------- 3. Result handling ----------
    files = list(Path(temp_dir).glob("*.*"))
    if not files:
        return None

    # NOTE(review): files[0] assumes yt-dlp left exactly one (merged) file
    # in the fresh temp dir — confirm with the chosen format string.
    original_file = files[0]
    new_path = original_file.with_name(clean_filename(original_file.name))
    original_file.rename(new_path)
    logger.info(f"下载完成并重命名: {new_path}")

    return new_path
|
||
|
||
|
||
def get_ffmpeg_path() -> str:
    """Locate ffmpeg: prefer an ffmpeg.exe bundled next to the Python
    interpreter, otherwise rely on an ``ffmpeg`` found via PATH."""
    candidate = os.path.join(os.path.dirname(sys.executable), "ffmpeg.exe")
    return candidate if os.path.exists(candidate) else "ffmpeg"
|
||
|
||
|
||
def clean_filename(filename: str) -> str:
    """Sanitize a file name by replacing or stripping characters that are
    unsafe in file systems / URLs (covers both ASCII and full-width
    CJK punctuation).
    """
    # Ordered (old, new) pairs — applied sequentially, so earlier rules
    # win when a later rule targets a character an earlier one produced.
    rules = (
        (" ", "_"), ("&", "_"), ("#", "_"),
        ("'", ""), ('"', ""), ("?", ""),
        (":", "_"), ("|", "_"), ("/", "_"), ("\\", "_"),
        ("*", "_"), ("<", "_"), (">", "_"),
        ("【", ""), ("】", ""), (":", ""), ("。", ""),
        (",", "_"), ("《", ""), ("》", ""),
        ("?", "_"), ("|", ""),
    )
    for old, new in rules:
        filename = filename.replace(old, new)
    return filename
|
||
|
||
|
||
async def proc_xcx(msg: str) -> Optional[str]:
    """Extract the shared-video URL from a QQ mini-program card payload.

    Looks for the ``qqdocurl`` field in the card's JSON-ish text, unescapes
    HTML entities and JSON slash escapes, and strips the query string.

    Returns:
        The cleaned base URL, or None when no ``qqdocurl`` field is present.
    """
    p_pattern = r'"qqdocurl":"(.*?)"'
    match = re.search(p_pattern, msg)

    if match:
        raw_url = match.group(1)
        unescaped = html.unescape(raw_url)
        # FIX: JSON escapes "/" as "\/". The old code only replaced the
        # doubled form "\\/" (r"\\/"), so standard "\/" sequences survived
        # and produced broken URLs. Handle both forms, doubled first.
        cleaned_url = unescaped.replace("\\\\/", "/").replace("\\/", "/")
        base_url = cleaned_url.split("?")[0]
        print("最终提取链接:", base_url)
        return base_url
    else:
        print("未找到 qqdocurl 字段")
        return None
|
||
|
||
|
||
# ----------------- Douyin parsing helpers -----------------
|
||
|
||
async def parse_douyin(url: str) -> Optional[Path]:
    """
    Resolve a douyin link to a downloaded local media file.

    Handles short links (v.douyin.com) as well as long / iesdouyin /
    m.douyin forms. Returns the Path of the fetched file (as produced by
    fetch_douyin_mp4), or None when nothing could be resolved.
    """
    # Short link: hand off to the Playwright-based fetcher using the
    # cookies.txt that lives next to this module.
    short = SHORT_LINK_PATTERN.search(url)
    if short:
        short_url = "https://" + short.group(1)
        try:
            cookies_path = os.path.dirname(__file__).replace("\\", "/") + "/cookies.txt"
            file_path = await fetch_douyin_mp4(short_url, cookies_path, 10)
            return file_path
        except Exception:
            logger.exception("获取直链失败")
            # Fall through and still try the original url below.

    # Try to scrape router data from the page.
    # NOTE(review): this branch currently only logs the URL — the actual
    # page-scraping implementation (DOUYIN_ROUTER_PATTERN) appears
    # unfinished, so non-short links always end up returning None.
    try:
        logger.info(f"获取到的信息:{url}")
    except Exception:
        logger.exception("抖音页面解析失败")
        return None
|
||
|
||
|
||
import asyncio
|
||
import tempfile
|
||
import aiohttp
|
||
import os
|
||
from typing import List, Dict, Optional
|
||
|
||
from playwright.async_api import async_playwright
|
||
import ffmpeg
|
||
|
||
|
||
class DouyinFetchError(Exception):
    """Raised when a douyin video/audio stream cannot be resolved or downloaded."""
|
||
|
||
|
||
def parse_netscape_cookies(cookies_txt_path: str) -> List[Dict]:
    """Convert a Netscape-format cookies.txt into Playwright cookie dicts.

    Blank lines, comment lines, and rows without exactly seven
    tab-separated fields are skipped. A positive numeric expiry is carried
    over as ``expires``; session cookies get no expiry key.
    """
    result: List[Dict] = []

    with open(cookies_txt_path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue

            fields = entry.split("\t")
            if len(fields) != 7:
                continue

            # Netscape columns: domain, includeSubdomains, path, secure,
            # expiry, name, value (includeSubdomains is unused here).
            domain, _include_sub, path, secure, expiry, name, value = fields

            record = {
                "name": name,
                "value": value,
                "domain": domain,
                "path": path,
                "secure": secure.upper() == "TRUE",
                "httpOnly": False,
            }

            if expiry.isdigit() and int(expiry) > 0:
                record["expires"] = int(expiry)

            result.append(record)

    return result
|
||
|
||
|
||
async def download_url(url: str, filepath: str, headers: Optional[Dict] = None):
    """Stream *url* into *filepath* in 64 KiB chunks.

    Raises DouyinFetchError when the server responds with anything other
    than HTTP 200. Uses a 300-second total timeout for the whole transfer.
    """
    request_headers = headers if headers is not None else {}
    timeout = aiohttp.ClientTimeout(total=300)

    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, headers=request_headers) as resp:
            if resp.status != 200:
                raise DouyinFetchError(f"下载失败 {resp.status}: {url}")

            with open(filepath, "wb") as out:
                async for chunk in resp.content.iter_chunked(1024 * 64):
                    out.write(chunk)
|
||
|
||
|
||
async def fetch_douyin_mp4(
    douyin_url: str,
    cookies_txt: str,
    wait_seconds: int = 15,
) -> Path:
    """Resolve a douyin page URL to a local MP4 file.

    Drives a real Chrome via Playwright (authenticated with cookies from
    *cookies_txt*), sniffs the separate video/audio stream URLs from
    network responses, downloads them, and muxes them with ffmpeg when a
    distinct audio track exists.

    Returns:
        Path of the merged (or directly copied) MP4 temp file.

    Raises:
        DouyinFetchError: when the cookies file is unusable or no video
        stream was observed within *wait_seconds*.
    """

    cookies = parse_netscape_cookies(cookies_txt)
    if not cookies:
        raise DouyinFetchError("cookies.txt 解析失败或为空")

    video_url: Optional[str] = None
    audio_url: Optional[str] = None

    async with async_playwright() as p:
        # NOTE(review): hard-coded Windows Chrome path and headless=False
        # make this host-specific — consider making both configurable.
        browser = await p.chromium.launch(
            executable_path="C:/Program Files/Google/Chrome/Application/chrome.exe",
            headless=False,
            args=[
                "--autoplay-policy=no-user-gesture-required",
                "--disable-features=AutoplayDisableSuppression",
                "--use-fake-ui-for-media-stream",
            ]
        )

        context = await browser.new_context(
            viewport={"width": 640, "height": 320},
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
        )

        await context.add_cookies(cookies)
        page = await context.new_page()

        def handle_response(response):
            # Keep the first video and first audio segment URL we observe.
            nonlocal video_url, audio_url
            url = response.url
            if video_url is None and ("video" in url and "mime_type=video_mp4" in url):
                video_url = url
            elif audio_url is None and ("audio" in url and "mime_type=audio" in url):
                audio_url = url

        page.on("response", handle_response)

        await page.goto(douyin_url, wait_until="domcontentloaded")
        await page.wait_for_timeout(wait_seconds * 1000)
        await browser.close()

    if not video_url:
        raise DouyinFetchError("未捕获视频流 URL")

    # -----------------------------
    # Download to temp files
    # -----------------------------
    tmp_dir = Path(tempfile.gettempdir())
    timestamp = int(time.time() * 1000)

    # FIX: the douyin/ subdirectory was never created, so opening the
    # download target failed on a clean machine.
    (tmp_dir / "douyin").mkdir(parents=True, exist_ok=True)
    tmp_video = tmp_dir / f"douyin/{timestamp}_video.mp4"
    tmp_out = tmp_dir / f"douyin/{timestamp}_out.mp4"

    # Download the video track
    await download_url(video_url, tmp_video)

    if audio_url:
        tmp_audio = tmp_dir / f"{timestamp}_audio.m4a"
        try:
            await download_url(audio_url, tmp_audio)
            # FIX: the old code called .output(tmp_audio, tmp_out, ...) on
            # the video stream, passing the audio *path* where ffmpeg-python
            # expects input streams — an invalid invocation. Open both
            # tracks as inputs and mux them into one output.
            video_in = ffmpeg.input(str(tmp_video))
            audio_in = ffmpeg.input(str(tmp_audio))
            (
                ffmpeg
                .output(video_in, audio_in, str(tmp_out),
                        vcodec="copy", acodec="aac", movflags="faststart")
                .overwrite_output()
                .run(quiet=True)
            )
        finally:
            tmp_video.unlink(missing_ok=True)
            tmp_audio.unlink(missing_ok=True)
    else:
        # Single-track video already contains audio: just copy it out.
        shutil.copy(tmp_video, tmp_out)
        tmp_video.unlink(missing_ok=True)

    return tmp_out
|