# ZImageTurbo/poetry_to_image_qwen.py

"""
古诗词意境图生成器（Qwen-Image 云端版）
将中国古典诗词通过 LLM 分析拆解为多个意境画面，
再使用 Qwen-Image API（通过 SiliconFlow）逐一生成高质量图片。
"""

import argparse
import json
import os
import re
import sys
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path

import requests as http_requests
import yaml
from openai import OpenAI
from PIL import Image


# ---------------------------------------------------------------------------
# 配置加载
# ---------------------------------------------------------------------------

def load_config(config_path: str = "config_qwen.yaml") -> dict:
    """Load the YAML configuration file and resolve the API keys.

    The ``llm``, ``image`` and ``output`` sections are guaranteed to exist as
    mappings in the returned dict, so callers can index them directly even
    when the file omits a section or leaves it empty.

    API key resolution:
      * ``cfg["llm"]["api_key"]``: the ``LLM_API_KEY`` environment variable,
        else the value from the file.
      * ``cfg["image"]["api_key"]``: the ``IMAGE_API_KEY`` environment
        variable, else the value from the file, else the LLM key.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed configuration with both ``api_key`` entries filled in.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the top level of the YAML document is not a mapping.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # safe_load() returns None for an empty document.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        raise ValueError(f"配置文件顶层必须是键值映射: {config_path}")

    # A section written as a bare `image:` parses to None; treat it like a
    # missing section so the `.get(..., default)` lookups elsewhere work.
    for section in ("llm", "image", "output"):
        if cfg.get(section) is None:
            cfg[section] = {}

    api_key = os.environ.get("LLM_API_KEY") or cfg["llm"].get("api_key", "")
    cfg["llm"]["api_key"] = api_key

    img_api_key = (
        os.environ.get("IMAGE_API_KEY")
        or cfg["image"].get("api_key", "")
        or api_key
    )
    cfg["image"]["api_key"] = img_api_key
    return cfg


# ---------------------------------------------------------------------------
# LLM 古诗词分析
# ---------------------------------------------------------------------------

# System prompt sent to the LLM by analyze_poetry(). It asks the model to
# split a classical Chinese poem into independent scenes, choose a traditional
# painting style for each one from the style table, and write a Chinese prompt,
# an English prompt and a negative prompt per scene. The reply must be a single
# JSON object with the keys title / author / dynasty / genre / analysis /
# images[], where each image has scene / description / style / prompt /
# prompt_en / negative_prompt. Trailing backslashes join source lines so the
# text sent to the model contains no hard line break at those points.
SYSTEM_PROMPT = """\
你是一位精通中国古典文学与视觉艺术的大师，同时深谙文生图 AI 的 prompt 工程。\
你的任务是分析用户提供的古诗词，将其意境拆解为若干幅独立的画面，\
每幅画面对应诗词中一个完整的意象或场景。

## 核心原则：信、雅、达

1. **信**（忠实）：画面内容必须忠于原诗的意象、情感和时代背景，不可凭空臆造。\
诗中有月则画月，诗中无人则不强加人物。
2. **雅**（优美）：画面描述应体现中国传统美学，注重意境营造、留白与含蓄之美。
3. **达**（通畅）：prompt 要清晰、具体、富有画面感，\
能被文生图模型准确理解并生成高质量图像。

## 诗词体裁识别与风格匹配

请先识别诗词的体裁（唐诗/宋词/元曲/其他），再根据题材选择最合适的中国传统画风。\
以下是可选的风格菜单，请根据诗意灵活选取，同一首诗的不同画面可以使用不同风格：

| 风格 | prompt 关键词 | 适用场景 |
|------|-------------|---------|
| 水墨写意 | 水墨写意，淡墨晕染，留白 | 山水、边塞、禅意、抒情 |
| 青绿山水 | 青绿山水，石青石绿，金碧辉煌 | 春夏山水、游记、壮丽河山 |
| 工笔花鸟 | 工笔花鸟，细腻勾勒，精细渲染 | 花卉、仕女、宫廷、精致细腻 |
| 工笔重彩 | 工笔重彩，浓墨重色，华丽精细 | 华丽、富贵、节庆、历史叙事 |
| 没骨画法 | 没骨画法，不勾轮廓，直接点染 | 花卉、蔬果、清新淡雅 |
| 文人画 | 文人画风格，诗书画印，意趣高远 | 隐逸、高洁、书卷气 |
| 泼墨大写意 | 泼墨大写意，墨色淋漓，气势磅礴 | 豪放、苍茫、雄壮 |
| 界画/建筑 | 界画，工整精细，楼台亭阁 | 楼阁、宫殿、城市场景 |
| 浅绛山水 | 浅绛山水，赭石淡彩，萧疏清远 | 秋冬山水、萧瑟、怀古 |

### 体裁特点提示
- **唐诗**（尤其五七言律绝）：意境开阔，气象宏大，多配水墨写意或青绿山水。
- **宋词**：情感细腻，意象精致，婉约派多配工笔花鸟/没骨，豪放派可配泼墨写意。
- **边塞诗**：苍凉壮阔，适合泼墨大写意或浅绛山水。
- **田园诗**：恬淡自然，适合青绿山水或文人画。
- **咏物诗/闺怨词**：精致细腻，适合工笔花鸟或没骨画法。

## 分析步骤

1. 识别诗词的标题、作者、体裁、题材和情感基调。
2. 逐句/逐联理解字面意思与深层意境。
3. 判断需要多少幅画来完整呈现意境（通常每一联或每一句对应一幅，\
但意境连贯的句子可以合并为一幅）。
4. 为每幅画从上方风格菜单中选择最匹配的画风。
5. 为每幅画撰写**中文 prompt** 和**英文 prompt**，均采用「正向描述」策略（只描述要画什么，\
不描述不要什么），包含：
   - 画面主体（人物、景物、动作、姿态）
   - 环境氛围（季节、天气、光线、时辰、色调）
   - 选定的艺术风格关键词
   - 构图与视角（远景/中景/特写，俯视/平视等）
   - 画面质感（绢本/纸本/留白/墨色浓淡等细节）
   - 画面氛围（清冷/温暖/苍茫/静谧等情感色彩）

### 中文 prompt 要求
- 使用中国传统绘画的专业术语（如水墨写意、工笔重彩、留白等）。
- 具体且富有画面感，避免抽象空泛的概念。

### 英文 prompt 要求
- 中文 prompt 的忠实翻译与适配，保持相同的画面内容和风格意图。
- 使用英文中对应的艺术术语（如 ink wash painting, meticulous brushwork, negative space 等）。
- 自然流畅的英文表达，而非逐字翻译。

## 重要提示
- 文生图模型（Qwen-Image）对中英文 prompt 均有优秀支持，中文表现尤为突出。
- 支持 negative prompt：请为每幅画面生成针对性的 negative_prompt，排除与目标画风冲突的元素。
- 每个 prompt 建议 80-200 字（中文）/ 50-150 词（英文），确保细节充分。
- 必须同时输出中文和英文两个版本的 prompt。

### negative_prompt 编写要点
- 针对所选画风排除冲突风格（如：水墨写意应排除"照片写实, 3D渲染, 油画质感"；\
工笔花鸟应排除"粗犷笔触, 抽象风格, 泼墨"）。
- 排除常见 AI 生成瑕疵（如：肢体变形, 手指畸形, 面部模糊, 文字乱码）。
- 排除与诗词意境不符的元素（如：悲秋诗不应出现"鲜艳色彩, 欢快氛围"）。
- 简洁有效，20-60 字（中文），以逗号分隔。

## 输出格式

严格按照以下 JSON 格式输出，不要包含任何其他文字：

```json
{
  "title": "诗词标题",
  "author": "作者",
  "dynasty": "朝代",
  "genre": "体裁（如：五言绝句、七言律诗、词·水调歌头等）",
  "analysis": "对整首诗意境的简要分析（中文，2-3句话）",
  "images": [
    {
      "scene": "这幅画对应的诗句（原文）",
      "description": "画面内容的中文描述",
      "style": "选用的画风（中文名称）",
      "prompt": "详细的中文文生图提示词，80-200字，仅使用正向描述...",
      "prompt_en": "Detailed English text-to-image prompt, 50-150 words, positive description only...",
      "negative_prompt": "针对该画面的负向提示词，排除与画风冲突的元素和常见瑕疵，20-60字..."
    }
  ]
}
```\
"""


def _parse_llm_json(raw: str):
    """Return the first JSON value that can be decoded from an LLM reply, else None.

    Candidates are tried in order: the body of the first Markdown code fence
    (or the whole reply when there is no fence), the outermost ``{...}`` span
    inside that text, and finally the outermost ``{...}`` span of the raw reply.
    """
    text = raw
    fenced = re.search(r"```(?:json)?\s*(.*?)```", raw, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()

    candidates = [text]
    for haystack in (text, raw):
        braced = re.search(r"\{.*\}", haystack, re.DOTALL)
        if braced:
            candidates.append(braced.group())

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def analyze_poetry(poem: str, cfg: dict) -> dict:
    """Ask the LLM to analyse a poem and return the structured image plan.

    Args:
        poem: The poem text supplied by the user.
        cfg: Full configuration; ``cfg["llm"]`` must provide ``base_url``,
            ``api_key`` and ``model`` and may provide ``temperature``
            (default 0.7) and ``max_tokens`` (default 4096).

    Returns:
        The decoded JSON object. It is guaranteed to be a dict whose
        ``"images"`` entry is a list.

    Exits the process with status 1 (after printing the raw reply) when the
    reply cannot be decoded into such an object.
    """
    llm_cfg = cfg["llm"]

    client = OpenAI(
        base_url=llm_cfg["base_url"],
        api_key=llm_cfg["api_key"],
    )

    print(f"\n{'='*60}")
    print("正在调用 LLM 分析古诗词意境...")
    print(f"模型: {llm_cfg['model']}")
    print(f"{'='*60}\n")

    response = client.chat.completions.create(
        model=llm_cfg["model"],
        temperature=llm_cfg.get("temperature", 0.7),
        max_tokens=llm_cfg.get("max_tokens", 4096),
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"请分析以下古诗词并生成图片方案：\n\n{poem}"},
        ],
    )

    # The message content can be None (e.g. an empty completion), so guard
    # before calling str methods on it.
    content = (response.choices[0].message.content or "").strip()

    result = _parse_llm_json(content)

    # Downstream code iterates result["images"]; reject anything else here so
    # the user sees the raw reply instead of a KeyError later on.
    if not isinstance(result, dict) or not isinstance(result.get("images"), list):
        print("LLM 返回内容无法解析为 JSON：")
        print(content)
        sys.exit(1)

    return result


def _clip(text: object, limit: int = 120) -> str:
    """Return *text* as a string, appending "..." only when it was cut at *limit* characters."""
    text = "" if text is None else str(text)
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def display_analysis(analysis: dict) -> None:
    """Print a human-readable summary of the LLM analysis to stdout.

    Every field is read with a fallback, so an analysis in which the LLM left
    out a key (or the whole ``images`` list) is still displayed instead of
    raising ``KeyError``. Long prompts are clipped to 120 characters and only
    clipped text gets a trailing ellipsis.

    Args:
        analysis: The dict returned by the LLM analysis step.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    title = analysis.get("title", "未知")
    author = analysis.get("author", "未知")
    dynasty = analysis.get("dynasty", "")
    genre = analysis.get("genre", "")
    print(f"  {title}  —  {dynasty} · {author}  [{genre}]")
    print(banner)
    print(f"\n  意境分析：{analysis.get('analysis', '')}\n")

    images = analysis.get("images") or []
    for i, img in enumerate(images, 1):
        print("─" * 50)
        print(f"  第 {i} 幅  |  {img.get('scene', '未指定')}")
        print(f"   画风选择：{img.get('style', '未指定')}")
        print(f"   中文描述：{img.get('description', '未指定')}")
        print(f"   Prompt(zh)：{_clip(img.get('prompt', '未指定'))}")
        if img.get("prompt_en"):
            print(f"   Prompt(en)：{_clip(img['prompt_en'])}")
        if img.get("negative_prompt"):
            print(f"   Negative  ：{_clip(img['negative_prompt'])}")

    print(f"\n共 {len(images)} 幅画面\n")


# ---------------------------------------------------------------------------
# 尺寸预设（适配 Qwen-Image 推荐分辨率）
# ---------------------------------------------------------------------------

# Named size presets mapped to the "WIDTHxHEIGHT" strings that
# resolve_image_size() returns; the trailing comment on each entry gives the
# aspect ratio the preset stands for.
SIZE_PRESETS: dict[str, str] = {
    "square":     "1328x1328",   # 1:1
    "phone":      "928x1664",    # 9:16
    "phone_hd":   "1056x1584",   # 2:3 (close to 9:16, HD)
    "desktop":    "1664x928",    # 16:9
    "desktop_hd": "1584x1056",   # 3:2 (close to 16:9, HD)
    "landscape":  "1472x1140",   # 4:3
    "portrait":   "1140x1472",   # 3:4
}


def resolve_image_size(img_cfg: dict) -> str:
    """Return the image size as a ``'WIDTHxHEIGHT'`` string.

    A known ``size_preset`` (case-insensitive, surrounding whitespace
    ignored) wins. Otherwise, including for ``"custom"``, an unknown preset,
    or a missing/null preset, the size is built from ``width`` and ``height``,
    each defaulting to 1328 when missing or null.

    Args:
        img_cfg: The ``image`` section of the configuration.

    Returns:
        The size string, e.g. ``"1328x1328"``.
    """
    # A bare `size_preset:` in YAML parses to None, which has no .strip().
    raw_preset = img_cfg.get("size_preset")
    preset = "" if raw_preset is None else str(raw_preset).strip().lower()
    if preset and preset != "custom" and preset in SIZE_PRESETS:
        return SIZE_PRESETS[preset]

    width = img_cfg.get("width")
    height = img_cfg.get("height")
    if width is None:
        width = 1328
    if height is None:
        height = 1328
    return f"{width}x{height}"


# ---------------------------------------------------------------------------
# Qwen-Image API 图片生成
# ---------------------------------------------------------------------------

def _call_image_api(
    prompt: str,
    cfg: dict,
    seed: int | None = None,
    negative_prompt: str = "",
) -> tuple[str, int]:
    """Call the image generation endpoint and return ``(image_url, seed)``.

    The request is a POST to ``{base_url}/images/generations`` using the
    ``image`` section of *cfg* (``base_url``, ``api_key``, ``model``, size
    settings and the optional ``num_inference_steps`` / ``guidance_scale`` /
    ``cfg`` values). The returned URL should be downloaded promptly; the
    original author notes it is valid for one hour.

    Retry policy (up to ``max_retries`` attempts, default 3):
      * timeouts and connection errors are retried, then re-raised;
      * HTTP 429 is retried with a linearly growing wait capped at 60s;
      * HTTP 408 and 5xx are retried after 3s;
      * other non-200 responses fail immediately, because repeating a bad
        request or a rejected credential cannot succeed.

    Args:
        prompt: The positive text prompt.
        cfg: Full configuration dict.
        seed: Seed to send; omitted from the request when None or negative.
        negative_prompt: Scene-specific negative prompt. It is joined with the
            global ``image.negative_prompt`` (scene first) using ", ".

    Returns:
        The image URL and the seed reported by the API (falling back to the
        requested seed, or 0 when none was requested).

    Raises:
        requests.exceptions.RequestException: On transport errors or HTTP
            error statuses once retries are exhausted.
        RuntimeError: When retries are exhausted on 429, the status code is
            unexpected, or the response does not contain an image URL.
    """
    img_cfg = cfg["image"]
    base_url = img_cfg.get("base_url", "https://api.siliconflow.cn/v1").rstrip("/")
    api_key = img_cfg["api_key"]

    url = f"{base_url}/images/generations"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload: dict = {
        "model": img_cfg.get("model", "Qwen/Qwen-Image"),
        "prompt": prompt,
        "image_size": resolve_image_size(img_cfg),
    }

    # Optional tuning knobs are forwarded only when configured.
    for option in ("num_inference_steps", "guidance_scale", "cfg"):
        value = img_cfg.get(option)
        if value is not None:
            payload[option] = value

    # Either negative prompt may be missing or None (null in YAML / in the
    # LLM's JSON); only non-blank strings are merged.
    negatives = [
        text.strip()
        for text in (negative_prompt, img_cfg.get("negative_prompt"))
        if isinstance(text, str) and text.strip()
    ]
    if negatives:
        payload["negative_prompt"] = ", ".join(negatives)

    if seed is not None and seed >= 0:
        payload["seed"] = seed

    max_retries = img_cfg.get("max_retries", 3)
    timeout = img_cfg.get("request_timeout", 180)

    for attempt in range(max_retries):
        is_last = attempt >= max_retries - 1

        try:
            resp = http_requests.post(
                url, headers=headers, json=payload, timeout=timeout
            )
        except http_requests.exceptions.Timeout:
            print(f"  请求超时 ({timeout}s)，" + (
                "已达最大重试次数" if is_last else f"重试 ({attempt+1}/{max_retries})..."
            ))
            if is_last:
                raise
            time.sleep(3)
            continue
        except http_requests.exceptions.ConnectionError as e:
            print(f"  连接失败: {e}")
            if is_last:
                raise
            time.sleep(5)
            continue

        status = resp.status_code

        if status == 200:
            data = resp.json()
            try:
                img_url = data["images"][0]["url"]
            except (KeyError, IndexError, TypeError) as err:
                raise RuntimeError(
                    f"API 响应中缺少图片 URL: {str(data)[:500]}"
                ) from err
            fallback_seed = seed if seed is not None and seed >= 0 else 0
            return img_url, data.get("seed", fallback_seed)

        if status == 429:
            if is_last:
                # No further attempt will be made, so waiting is pointless.
                print("  API 限流 (429)，已达最大重试次数")
                break
            wait = min(60, 5 * (attempt + 1))
            print(f"  API 限流 (429)，等待 {wait}s 后重试...")
            time.sleep(wait)
            continue

        error_detail = resp.text[:500]
        print(f"  API 返回错误 [{status}]: {error_detail}")
        transient = status == 408 or status >= 500
        if transient and not is_last:
            time.sleep(3)
            continue
        # Raises HTTPError for 4xx/5xx; anything else is an unexpected status.
        resp.raise_for_status()
        raise RuntimeError(f"API 返回了非预期的状态码: {status}")

    raise RuntimeError("API 调用失败，已达最大重试次数")


def _download_image(url: str, save_path: Path, timeout: int = 120) -> None:
    """Fetch the image at *url* and store it at *save_path*.

    The response body is decoded with Pillow and written back out through
    ``Image.save``, so the stored file is whatever Pillow produces for
    *save_path* rather than the raw downloaded bytes.

    Args:
        url: Location of the generated image.
        save_path: Destination file.
        timeout: Timeout in seconds passed to the HTTP request.

    Raises:
        requests.exceptions.HTTPError: When the server answers with an error
            status (via ``raise_for_status``).
    """
    response = http_requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    payload = BytesIO(response.content)
    picture = Image.open(payload)
    picture.save(save_path)


def generate_images(analysis: dict, cfg: dict) -> list[Path]:
    """Generate the images for every analysed scene and save them locally.

    For each entry of ``analysis["images"]`` the configured prompt language
    is used when that prompt exists, otherwise the other language is used; a
    scene with no prompt at all is skipped instead of aborting the run. Each
    scene yields ``images_per_prompt`` images (clamped to 1-4), saved as
    ``{prefix}_{NN}{suffix}.png`` where the suffix is a, b, c... only when
    several variants are requested. Unless ``output.save_prompts`` is false, a
    ``{prefix}_{NN}_prompt.txt`` file records the prompts and the seeds used.

    Failures of a single API call or download are reported and skipped.

    Args:
        analysis: The LLM analysis containing the ``images`` list.
        cfg: Full configuration; reads the ``image`` and ``output`` sections.

    Returns:
        Paths of the images that were saved successfully.
    """
    img_cfg = cfg["image"]
    out_cfg = cfg["output"]

    output_dir = Path(out_cfg.get("dir", "./output"))
    output_dir.mkdir(parents=True, exist_ok=True)

    prefix = out_cfg.get("filename_prefix", "poem")
    image_size = resolve_image_size(img_cfg)
    prompt_lang = img_cfg.get("prompt_language", "zh")

    # Explicit nulls in the YAML behave like absent keys.
    seed = img_cfg.get("seed")
    if seed is None:
        seed = -1
    requested = img_cfg.get("images_per_prompt")
    if requested is None:
        requested = 1
    images_per_prompt = max(1, min(4, requested))

    # Seeds are kept below 10**10, the bound the time-based seed already used;
    # the offset is applied before the modulus so it cannot push past it.
    seed_modulus = 10**10

    print(f"\n{'='*60}")
    print("Qwen-Image API 图片生成")
    print(f"模型: {img_cfg.get('model', 'Qwen/Qwen-Image')}")
    print(f"图片尺寸: {image_size}")
    print(f"Prompt 语言: {prompt_lang}")
    if images_per_prompt > 1:
        print(f"每个 prompt 生成 {images_per_prompt} 张图（不同种子）")
    print(f"{'='*60}\n")

    saved_paths: list[Path] = []
    scenes = analysis.get("images") or []
    total = len(scenes)

    for i, img_info in enumerate(scenes, 1):
        prompt_zh = img_info.get("prompt") or ""
        prompt_en = img_info.get("prompt_en") or ""
        scene_text = img_info.get("scene", "")

        # Prefer the configured language, fall back to the other one, and
        # track which was really used so the labels stay truthful.
        if prompt_lang == "en" and prompt_en:
            prompt, used_lang = prompt_en, "en"
        elif prompt_zh:
            prompt, used_lang = prompt_zh, "zh"
        elif prompt_en:
            prompt, used_lang = prompt_en, "en"
        else:
            print(f"\n[{i}/{total}] 跳过（缺少 prompt）: {scene_text}")
            continue

        # The LLM may emit null (or a non-string) for negative_prompt.
        scene_negative = img_info.get("negative_prompt") or ""
        if not isinstance(scene_negative, str):
            scene_negative = str(scene_negative)

        print(f"\n[{i}/{total}] 正在生成: {scene_text}")
        print(f"  画风: {img_info.get('style', '未指定')}")
        print(f"  Prompt({used_lang}): {prompt[:120]}...")
        if scene_negative:
            print(f"  Negative: {scene_negative[:100]}")

        seeds_used: list[int] = []

        for j in range(images_per_prompt):
            variant_offset = i * 100 + j
            base_seed = seed if seed >= 0 else int(time.time() * 1000)
            actual_seed = (base_seed + variant_offset) % seed_modulus

            suffix = chr(ord("a") + j) if images_per_prompt > 1 else ""
            if images_per_prompt > 1:
                print(f"  --- 第 {j+1}/{images_per_prompt} 张 (seed={actual_seed}) ---")

            start_time = time.time()

            try:
                img_url, returned_seed = _call_image_api(
                    prompt, cfg, seed=actual_seed, negative_prompt=scene_negative
                )
            except Exception as e:
                # Best effort: one failed image must not stop the batch.
                print(f"  生成失败: {e}")
                continue

            seeds_used.append(returned_seed)
            elapsed_api = time.time() - start_time
            print(f"  API 响应完成，耗时 {elapsed_api:.1f}s (seed={returned_seed})")

            img_path = output_dir / f"{prefix}_{i:02d}{suffix}.png"
            try:
                _download_image(img_url, img_path)
                saved_paths.append(img_path)
                print(f"  已保存: {img_path}")
            except Exception as e:
                print(f"  图片下载失败: {e}")
                print(f"  URL（1小时内有效）: {img_url}")

        if out_cfg.get("save_prompts", True):
            txt_path = output_dir / f"{prefix}_{i:02d}_prompt.txt"
            seeds_text = ", ".join(str(s) for s in seeds_used)
            txt_path.write_text(
                f"Scene: {scene_text}\n"
                f"Style: {img_info.get('style', '')}\n"
                f"Description: {img_info.get('description', '')}\n"
                f"Prompt(zh): {prompt_zh}\n"
                f"Prompt(en): {prompt_en}\n"
                f"Negative: {scene_negative}\n"
                f"Used({used_lang}): {prompt}\n"
                # Recorded so a run with time-based seeds can be reproduced.
                f"Seeds: {seeds_text}\n",
                encoding="utf-8",
            )

    return saved_paths


# ---------------------------------------------------------------------------
# 主流程
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: analyse a poem, then generate its images.

    Steps:
      1. Parse the CLI options (config path, poem text, ``--analyze-only``,
         output directory override).
      2. Load the configuration and decide the output directory: the ``-o``
         value as-is, otherwise ``<configured dir>/<YYYY-MM-DD>/<HH-MM-SS>``.
      3. Read the poem from ``-p`` or interactively from stdin, ending at the
         first blank line or at end of input.
      4. Run the LLM analysis, print it and save it as ``analysis.json``.
      5. Unless ``--analyze-only`` is given, generate the images and print a
         summary.

    Exits with status 0 when no poem was entered, and with status 1 when
    image generation was requested but produced no image.
    """
    parser = argparse.ArgumentParser(
        description="古诗词意境图生成器 — 基于 LLM 分析 + Qwen-Image API 生成"
    )
    parser.add_argument(
        "-c", "--config",
        default="config_qwen.yaml",
        help="配置文件路径（默认: config_qwen.yaml）",
    )
    parser.add_argument(
        "-p", "--poem",
        type=str,
        default=None,
        help="直接传入古诗词文本（如不指定则交互式输入）",
    )
    parser.add_argument(
        "--analyze-only",
        action="store_true",
        help="仅进行 LLM 分析，不生成图片",
    )
    parser.add_argument(
        "-o", "--output",
        type=str,
        default=None,
        help="覆盖输出目录",
    )
    args = parser.parse_args()

    cfg = load_config(args.config)

    # The config may omit the `output` section or leave it empty (None);
    # everything in it has a default, so start from an empty mapping.
    out_cfg = cfg.get("output")
    if not isinstance(out_cfg, dict):
        out_cfg = {}
        cfg["output"] = out_cfg

    if args.output:
        out_cfg["dir"] = args.output
    else:
        now = datetime.now()
        date_dir = now.strftime("%Y-%m-%d")
        time_dir = now.strftime("%H-%M-%S")
        out_cfg["dir"] = str(
            Path(out_cfg.get("dir", "./output")) / date_dir / time_dir
        )

    if args.poem:
        poem = args.poem
    else:
        print("请输入古诗词（输入空行结束）：")
        lines = []
        while True:
            try:
                line = input()
            except EOFError:
                # Piped or redirected stdin ends without a blank line.
                break
            if line.strip() == "":
                break
            lines.append(line)
        poem = "\n".join(lines)

    if not poem.strip():
        print("未输入任何内容，退出。")
        sys.exit(0)

    print(f"\n输入的诗词：\n{poem}")

    analysis = analyze_poetry(poem, cfg)
    display_analysis(analysis)

    output_dir = Path(out_cfg.get("dir", "./output"))
    output_dir.mkdir(parents=True, exist_ok=True)
    analysis_path = output_dir / "analysis.json"
    analysis_path.write_text(
        json.dumps(analysis, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"分析结果已保存: {analysis_path}")

    if args.analyze_only:
        print("\n已完成分析（--analyze-only 模式），跳过图片生成。")
        return

    saved = generate_images(analysis, cfg)

    print(f"\n{'='*60}")
    print(f"全部完成！共生成 {len(saved)} 幅图片：")
    for p in saved:
        print(f"  {p}")
    print(f"{'='*60}\n")

    # Let shell callers detect a run in which every image failed.
    if not saved:
        sys.exit(1)


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()