所有功能已完成,运行OK
This commit is contained in:
616
classify_pois.py
Normal file
616
classify_pois.py
Normal file
@@ -0,0 +1,616 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Poem Classification System with LLM API Integration
|
||||
|
||||
基于大模型的中国古代诗词多维度分类系统
|
||||
|
||||
分类维度包括:
|
||||
- 季节时序(四季、节气、时辰)
|
||||
- 题材类型(山水、边塞、咏史、咏物等)
|
||||
- 情感心境(喜怒哀乐、忧思愁绪等)
|
||||
- 景物意象(自然、植物、动物、建筑等)
|
||||
- 哲理思想(儒释道、人生感悟等)
|
||||
- 艺术手法(比兴、用典、对仗等)
|
||||
- 人物关系(送别、怀人、思乡等)
|
||||
- 生活场景(宴饮、耕作、读书等)
|
||||
|
||||
Configuration via environment variables:
|
||||
- LLM_BASE_URL: API base URL (e.g., http://localhost:11434/v1)
|
||||
- LLM_API_KEY: API key (optional for local models)
|
||||
- LLM_MODEL: Model name (e.g., qwen:7b, gpt-4)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import hashlib
|
||||
from typing import Dict, List, Set, Optional
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import threading
|
||||
|
||||
|
||||
def generate_poem_signature(poem_data: Dict) -> str:
    """
    Derive a stable MD5 signature for a poem from its title, author and text.

    Two poems with identical (stripped) title, author and flattened body
    produce the same digest; this is how duplicates are detected across runs.
    """
    title = poem_data.get('title', '').strip()
    author = poem_data.get('author', '').strip()

    lines: List[str] = []
    # 'paragraphs' may mix plain strings with one level of nested lists.
    if 'paragraphs' in poem_data:
        for entry in poem_data['paragraphs']:
            if isinstance(entry, str):
                lines.append(entry.strip())
            elif isinstance(entry, list):
                lines += [item.strip() for item in entry]
    # 'content' may be a single string or a flat list of strings.
    if 'content' in poem_data:
        body = poem_data['content']
        if isinstance(body, str):
            lines.append(body.strip())
        elif isinstance(body, list):
            lines += [item.strip() for item in body]

    digest_input = f"title:{title}|author:{author}|content:" + '\n'.join(lines)
    return hashlib.md5(digest_input.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
def load_existing_poems(output_file: str) -> Set[str]:
    """
    Load signatures of poems already present in the output JSONL file.

    Prefers the 'signature' field that classify_poem stores with every
    record; for legacy records without one, recomputes the signature from
    the stored title/author/paragraphs.

    Args:
        output_file: path to the JSONL results file (may not exist yet)

    Returns:
        Set of signature strings; empty when the file is missing/unreadable.
    """
    existing_signatures: Set[str] = set()

    if not os.path.exists(output_file):
        return existing_signatures

    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f" Warning: Skipping invalid JSON on line {line_num}")
                    continue
                if not isinstance(data, dict):
                    continue
                # BUG FIX: the previous version recomputed the signature from
                # the LLM 'analysis' text split on newlines, which never equals
                # a signature computed from the poem's real paragraphs — so
                # resume/skip silently never matched. Use the stored signature.
                sig = data.get('signature')
                if isinstance(sig, str) and sig:
                    existing_signatures.add(sig)
                elif 'title' in data and 'author' in data:
                    # Legacy record: rebuild from the stored (simplified)
                    # paragraphs instead of the analysis text.
                    existing_signatures.add(generate_poem_signature({
                        'title': data.get('title', ''),
                        'author': data.get('author', ''),
                        'paragraphs': data.get('paragraphs', [])
                    }))
    except IOError as e:
        print(f" Warning: Could not read existing output file: {e}")

    return existing_signatures
|
||||
|
||||
|
||||
def append_to_output_file(output_file: str, data: Dict, lock: threading.Lock):
    """
    Append one record to the output file as a single JSONL line.

    All serialization and writing happens while *lock* is held, so
    concurrent worker threads cannot interleave their output lines.
    """
    with lock, open(output_file, 'a', encoding='utf-8') as sink:
        sink.write(json.dumps(data, ensure_ascii=False) + '\n')
|
||||
|
||||
|
||||
class LLMClassifier:
    """
    LLM-backed classifier for classical Chinese poems.

    Calls a local or remote model through an OpenAI-compatible
    /chat/completions endpoint and labels each poem along 20+ dimensions
    (season, genre, emotion, imagery, philosophy, technique, ...).
    """

    # Runtime prompt sent to the model; placeholders {title}/{author}/{content}
    # are filled by str.format, hence the doubled braces around the JSON example.
    CLASSIFICATION_PROMPT = """你是一位中国古代诗词分类专家。请深入分析以下诗词,并从多个维度进行分类标注。

【诗词信息】
- 标题:{title}
- 作者:{author}
- 内容:{content}

【重要说明】
- 如果诗词内容是繁体中文,请在输出中包含简体中文版本
- 分类标签一律使用简体中文

【分类维度与标签选项】

1. 季节(season):["春", "夏", "秋", "冬", "四季", "无明确季节"]
2. 节气(solar_terms):24 节气中的具体节气,如 "立春"、"清明"、"冬至" 等,无则空数组
3. 时辰(time_of_day):["清晨", "上午", "正午", "下午", "黄昏", "夜晚", "深夜", "黎明", "不明确"]
4. 题材类型(genre):["山水田园", "边塞征战", "咏史怀古", "咏物言志", "送别怀人", "思乡怀远",
   "爱情闺怨", "友情赠答", "羁旅漂泊", "隐逸闲适", "讽喻时事", "节日习俗",
   "宴饮酬唱", "读书治学", "农耕劳作", "宗教禅理", "其他"]
5. 情感基调(emotion_tone):["喜悦欢快", "悲伤哀愁", "愤怒激愤", "忧郁伤感", "孤独寂寞",
   "宁静淡泊", "豪迈激昂", "思念眷恋", "惆怅失落", "平和超脱", "复杂混合"]
6. 具体情感(emotions):从以下选择 3-5 个最贴切的:
   ["喜", "怒", "哀", "乐", "忧", "思", "悲", "恐", "惊", "愁", "恨", "爱",
   "恋", "盼", "悔", "愧", "傲", "谦", "静", "躁"]
7. 景物 - 自然(nature_scenery):["山", "水", "云", "雨", "雪", "风", "雷", "电", "日", "月", "星", "霜", "露", "霞"]
8. 景物 - 植物(plants):["松", "竹", "梅", "兰", "菊", "荷", "柳", "桃", "李", "杏", "梨", "枫", "梧桐", "芭蕉", "其他"]
9. 景物 - 动物(animals):["鸟", "雁", "燕", "鹊", "蝉", "蛙", "鱼", "龙", "凤", "马", "牛", "羊", "犬", "其他"]
10. 景物 - 建筑(buildings):["楼", "阁", "亭", "台", "轩", "榭", "桥", "寺", "塔", "城", "关", "宫", "殿", "院", "其他"]
11. 意象关键词(imagery):提取 5-10 个诗中最具代表性的意象词汇(简体中文)
12. 哲理思想(philosophy):["儒家思想", "道家思想", "佛家禅理", "人生感悟", "历史兴叹", "自然之道", "无明显哲理"]
13. 人生阶段(life_stage):["少年", "青年", "中年", "老年", "不明确"]
14. 社会身份(social_role):["士人", "官员", "隐士", "游子", "征人", "商贾", "农夫", "僧道", "闺中", "其他"]
15. 写作手法(technique):["比兴", "赋", "对仗", "用典", "借景抒情", "托物言志", "虚实结合", "动静结合", "其他"]
16. 修辞手法(rhetoric):["比喻", "拟人", "夸张", "对偶", "排比", "反复", "设问", "反问", "其他"]
17. 色彩意象(colors):提取诗中的色彩词,如 ["青", "绿", "红", "白", "黄", "紫", "碧", "翠", "苍", "金"]
18. 声音意象(sounds):["钟声", "鼓声", "笛声", "琴声", "风声", "雨声", "鸟鸣", "蝉鸣", "其他"]
19. 地理方位(location):["江南", "塞北", "中原", "巴蜀", "关中", "岭南", "吴越", "荆楚", "其他"]
20. 节日习俗(festival):["春节", "元宵", "清明", "端午", "七夕", "中秋", "重阳", "除夕", "无"]

【返回格式】
请返回严格的 JSON 格式,结构如下:
{{
  "original_text": ["空山新雨後,天氣晚來秋。", ...], // 原始文本(如果输入是繁体)
  "simplified_text": ["空山新雨后,天气晚来秋。", ...], // 简体中文版本
  "season": ["秋"],
  "solar_terms": ["白露"],
  "time_of_day": "黄昏",
  "genre": ["山水田园", "隐逸闲适"],
  "emotion_tone": "宁静淡泊",
  "emotions": ["静", "喜", "乐"],
  "nature_scenery": ["山", "水", "月"],
  "plants": ["松", "竹"],
  "animals": ["鸟"],
  "buildings": [],
  "imagery": ["空山", "新雨", "明月", "青松"],
  "philosophy": ["道家思想", "自然之道"],
  "life_stage": "中年",
  "social_role": "隐士",
  "technique": ["借景抒情", "动静结合"],
  "rhetoric": ["拟人"],
  "colors": ["青", "白"],
  "sounds": [],
  "location": "终南山",
  "festival": "无",
  "analysis": "简要分析这首诗的主题思想、艺术特色和情感内涵(100-200 字,使用简体中文)"
}}

【注意事项】
- 只返回 JSON,不要有任何其他文字说明
- 每个维度根据诗意选择最贴切的标签,可以是 1 个或多个
- 如果某个维度没有明确对应,选择"无"、"不明确"或空数组
- 意象关键词应从原诗中提取或合理归纳(使用简体中文)
- 分析要准确、深入、简洁(使用简体中文)
- 如果输入是繁体中文,original_text 和 simplified_text 都要填写
- 如果输入已经是简体中文,original_text 和 simplified_text 填写相同内容"""

    def __init__(self, base_url: Optional[str] = None, api_key: Optional[str] = None,
                 model: Optional[str] = None, output_file: Optional[str] = None):
        """
        Initialize the classifier.

        Args:
            base_url: API base URL; falls back to $LLM_BASE_URL, then a
                built-in default.
            api_key: API key; optional for local models. Falls back to
                $LLM_API_KEY.
            model: model name; falls back to $LLM_MODEL, then 'qwen:7b'.
            output_file: if given, every successful classification is
                appended to this JSONL file immediately.
        """
        self.base_url = (base_url or os.getenv('LLM_BASE_URL') or 'https://api.siliconflow.cn/v1').rstrip('/')
        self.api_key = api_key or os.getenv('LLM_API_KEY') or ''
        self.model = model or os.getenv('LLM_MODEL') or 'qwen:7b'
        self.output_file = output_file
        # Lock serializing appends to output_file across worker threads;
        # None when no incremental output is requested.
        self.file_lock = threading.Lock() if output_file else None

    def _call_api(self, messages: List[Dict], temperature: float = 0.3, max_retries: int = 3) -> str:
        """
        Call the chat-completions API, retrying on network errors.

        Args:
            messages: chat messages (role/content dicts)
            temperature: sampling temperature
            max_retries: maximum number of attempts

        Returns:
            The assistant message content, or "" on any failure.
        """
        url = f"{self.base_url}/chat/completions"

        headers = {
            'Content-Type': 'application/json',
        }

        # Authorization header only when a key is configured (local models
        # typically need none).
        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'

        payload = {
            'model': self.model,
            'messages': messages,
            'temperature': temperature,
            'stream': False
        }

        for attempt in range(max_retries):
            try:
                data = json.dumps(payload).encode('utf-8')
                req = urllib.request.Request(url, data=data, headers=headers, method='POST')

                # Generous timeout: large prompts on local models can be slow.
                with urllib.request.urlopen(req, timeout=180) as response:
                    result = json.loads(response.read().decode('utf-8'))
                    return result['choices'][0]['message']['content']

            except urllib.error.URLError as e:
                # Network-level failure: retry until attempts run out.
                print(f" API request failed (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return ""
            except json.JSONDecodeError as e:
                # Malformed response body: retrying is unlikely to help.
                print(f" Failed to parse API response: {e}")
                return ""
            except Exception as e:
                print(f" Unexpected error (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return ""

        return ""

    def classify_poem(self, poem_data: Dict, skip_if_exists: bool = False,
                      existing_signatures: Optional[Set[str]] = None) -> Optional[Dict]:
        """
        Classify a single poem with the LLM.

        Args:
            poem_data: poem dict (title/author plus 'paragraphs' and/or
                'content')
            skip_if_exists: skip poems whose signature is already known
            existing_signatures: set of known poem signatures

        Returns:
            Classification result dict, or None when the poem is empty,
            already processed, or the API call / JSON parse fails.
        """
        title = poem_data.get('title', '')
        author = poem_data.get('author', '')

        # Flatten the poem body: 'paragraphs' may nest one level of lists;
        # 'content' may be a string or a flat list.
        content_parts = []
        if 'paragraphs' in poem_data:
            for para in poem_data['paragraphs']:
                if isinstance(para, str):
                    content_parts.append(para)
                elif isinstance(para, list):
                    content_parts.extend(para)
        if 'content' in poem_data:
            if isinstance(poem_data['content'], str):
                content_parts.append(poem_data['content'])
            elif isinstance(poem_data['content'], list):
                content_parts.extend(poem_data['content'])

        content = '\n'.join(content_parts)

        if not title and not content:
            return None

        # Skip poems that were already processed in a previous run.
        if skip_if_exists and existing_signatures is not None:
            poem_sig = generate_poem_signature(poem_data)
            if poem_sig in existing_signatures:
                return None

        prompt = self.CLASSIFICATION_PROMPT.format(
            title=title or '无题',
            author=author or '佚名',
            content=content
        )

        messages = [
            {'role': 'system', 'content': '你是一位中国古代诗词分类专家,精通诗词鉴赏和分类,能够准确识别诗词的题材、情感、意象和艺术特色。'},
            {'role': 'user', 'content': prompt}
        ]

        response = self._call_api(messages, temperature=0.3)

        if not response:
            return None

        try:
            # Salvage the JSON object even if the model wrapped it in prose:
            # grab the outermost {...} span first, fall back to the raw text.
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                result = json.loads(json_match.group())
            else:
                result = json.loads(response)

            # Handle traditional->simplified conversion results.
            paragraphs = poem_data.get('paragraphs', [])
            simplified_paragraphs = []

            # Prefer the simplified version when the LLM returned one.
            if 'simplified_text' in result:
                simplified_paragraphs = result['simplified_text']
            elif 'original_text' in result:
                # Only original_text present: input was already simplified.
                simplified_paragraphs = paragraphs
            else:
                # No conversion info at all: keep the input text.
                simplified_paragraphs = paragraphs

            classification_result = {
                'id': poem_data.get('id', 'unknown'),
                'title': title,
                'author': author,
                'paragraphs': simplified_paragraphs,  # simplified version
                'original_paragraphs': paragraphs if paragraphs != simplified_paragraphs else None,  # original kept only when it differs
                'llm_classification': result,
                'signature': generate_poem_signature({
                    'title': title,
                    'author': author,
                    'paragraphs': simplified_paragraphs
                }),  # signature is based on the simplified content
                'timestamp': datetime.now().isoformat(),
                'read_mark': False
            }

            # Persist immediately so progress survives interruption.
            if self.output_file and self.file_lock:
                append_to_output_file(self.output_file, classification_result, self.file_lock)

            return classification_result

        except json.JSONDecodeError as e:
            print(f" Failed to parse LLM response as JSON: {e}")
            print(f" Raw response: {response[:300]}...")
            return None
|
||||
|
||||
|
||||
def is_valid_poem_data(poem_data: Dict) -> bool:
    """
    Check whether a poem record has the expected shape.

    A valid record is a dict with a string 'id', non-blank string 'title'
    and 'author', and 'paragraphs' given as a list of strings.
    """
    if not isinstance(poem_data, dict):
        return False

    # All three core fields must be present.
    if not {'author', 'paragraphs', 'title'} <= poem_data.keys():
        return False

    paragraphs = poem_data['paragraphs']
    if not isinstance(paragraphs, list):
        return False
    if not all(isinstance(line, str) for line in paragraphs):
        return False

    # 'id' must exist and be a string (a missing key yields None here).
    if not isinstance(poem_data.get('id'), str):
        return False

    # Author and title must be non-blank strings.
    for field in ('author', 'title'):
        value = poem_data[field]
        if not isinstance(value, str) or not value.strip():
            return False

    return True
|
||||
|
||||
|
||||
def load_poems_from_file(file_path: str) -> List[Dict]:
    """
    Load poem records from a file.

    Accepts three layouts: a JSON array, a single JSON object, or JSONL
    (one JSON object per line). Returns an empty list for empty or
    unreadable files.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = fh.read().strip()
    except IOError as err:
        print(f"Error reading file {file_path}: {err}")
        return []

    if not text:
        return []

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to JSONL: keep only lines that parse to dicts.
        records = []
        for raw_line in text.splitlines():
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                candidate = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                records.append(candidate)
        return records

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]
    # Any other top-level JSON value (number, string, ...) is not poem data.
    return []
|
||||
|
||||
|
||||
def main():
    """CLI entry point: scan input files, classify every poem, print stats."""
    import argparse

    parser = argparse.ArgumentParser(
        description='基于大模型的中国古代诗词多维度分类系统',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 使用本地模型分类(递归搜索子目录)
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url http://localhost:11434/v1 --model qwen:7b

  # 使用远程模型分类
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url https://api.openai.com/v1 --model gpt-4 --api-key sk-xxx

  # 断点续跑(自动跳过已处理的诗词)
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url http://localhost:11434/v1 --model qwen:7b

  # 只扫描当前目录,不递归子目录
  python classify_pois.py ./poems ./output.jsonl 10 --llm --no-recursive \\
      --base-url http://localhost:11434/v1 --model qwen:7b

环境变量(备选):
  LLM_BASE_URL, LLM_API_KEY, LLM_MODEL
"""
    )

    parser.add_argument('input_folder', help='输入文件夹路径(包含诗词 JSON/JSONL 文件)')
    parser.add_argument('output_file', help='输出 JSONL 文件路径')
    parser.add_argument('max_workers', nargs='?', type=int, default=10, help='并发数(默认:10)')

    parser.add_argument('--llm', action='store_true', help='使用 LLM 分类(默认)')
    parser.add_argument('--base-url', type=str, help='LLM API 基础 URL')
    parser.add_argument('--api-key', type=str, help='LLM API 密钥')
    parser.add_argument('--model', type=str, help='LLM 模型名称')
    parser.add_argument('--no-recursive', action='store_true', help='不递归搜索子目录')

    args = parser.parse_args()

    # Resolve configuration: CLI flags win over environment variables.
    base_url = args.base_url or os.getenv('LLM_BASE_URL', 'http://localhost:11434/v1')
    api_key = args.api_key or os.getenv('LLM_API_KEY', '')
    model = args.model or os.getenv('LLM_MODEL', 'qwen:7b')

    print(f"\n{'='*70}")
    print(f"基于大模型的诗词多维度分类系统")
    print(f"{'='*70}")
    print(f"输入目录:{args.input_folder}")
    print(f"输出文件:{args.output_file}")
    print(f"API 地址:{base_url}")
    print(f"模型名称:{model}")
    print(f"{'='*70}\n")

    # Resume support: collect signatures of poems already in the output file.
    existing_signatures = load_existing_poems(args.output_file)
    if existing_signatures:
        print(f"检测到输出文件中已有 {len(existing_signatures)} 首诗词,将自动跳过重复项\n")

    # Classifier appends each result to output_file as soon as it is ready.
    llm_classifier = LLMClassifier(
        base_url=base_url,
        api_key=api_key,
        model=model,
        output_file=args.output_file
    )

    # Collect input files (optionally recursing into subdirectories).
    valid_extensions = {'.json', '.jsonl'}
    json_files = []

    if args.no_recursive:
        # Non-recursive: scan only the top-level directory.
        for f in os.listdir(args.input_folder):
            if not any(f.lower().endswith(ext) for ext in valid_extensions):
                continue
            file_path = os.path.join(args.input_folder, f)
            if not os.path.isfile(file_path):
                continue
            json_files.append(file_path)
    else:
        # Recursive walk over all subdirectories.
        for root, dirs, files in os.walk(args.input_folder):
            # Prune hidden directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if not d.startswith('.')]

            for f in files:
                if not any(f.lower().endswith(ext) for ext in valid_extensions):
                    continue
                # Skip hidden files.
                if f.startswith('.'):
                    continue
                file_path = os.path.join(root, f)
                if os.path.isfile(file_path):
                    json_files.append(file_path)

    # Sort by path so processing order is deterministic between runs.
    json_files.sort()

    print(f"发现 {len(json_files)} 个有效的 JSON/JSONL 文件\n")

    # Show the directory structure (if there are subdirectories).
    # NOTE(review): files directly in input_folder contribute an empty string
    # here, so this prints even without real subdirectories — confirm intent.
    subdirs = set(os.path.dirname(f).replace(args.input_folder, '').lstrip('\\').lstrip('/') for f in json_files)
    if subdirs:
        print(f"包含子目录:{', '.join(sorted(subdirs))}")

    # Aggregate counters across all files; guarded by stats_lock because
    # they are updated from the as_completed loop.
    stats = {
        'processed': 0,
        'skipped': 0,
        'failed': 0,
        'invalid': 0,
        'total': 0
    }
    stats_lock = threading.Lock()

    def process_poem_batch(file_path: str) -> dict:
        """Classify every poem in one input file; returns per-file counters."""
        file_stats = {'processed': 0, 'skipped': 0, 'failed': 0, 'invalid': 0}
        poems = load_poems_from_file(file_path)

        for idx, poem in enumerate(poems):
            if not is_valid_poem_data(poem):
                file_stats['invalid'] += 1
                print(f" [{idx+1}/{len(poems)}] 格式无效:{poem.get('title', 'Unknown')}")
                continue

            # Dedup against results of previous runs and other workers.
            poem_sig = generate_poem_signature(poem)
            if poem_sig in existing_signatures:
                file_stats['skipped'] += 1
                print(f" [{idx+1}/{len(poems)}] 跳过(已存在): {poem.get('title', 'Unknown')}")
                continue

            print(f" [{idx+1}/{len(poems)}] 分类中:{poem.get('title', 'Unknown')}")
            result = llm_classifier.classify_poem(poem, skip_if_exists=True, existing_signatures=existing_signatures)

            if result:
                file_stats['processed'] += 1
                # Record the signature so later files skip this poem.
                existing_signatures.add(poem_sig)
                print(f" ✓ 成功")
            else:
                file_stats['failed'] += 1
                print(f" ✗ 失败")

        return file_stats

    # Fan the files out to a thread pool (one task per input file).
    print(f"使用 {args.max_workers} 个并发线程处理\n")

    with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
        # Submit one processing task per file.
        future_to_file = {executor.submit(process_poem_batch, fp): fp for fp in json_files}

        # Merge per-file counters as tasks finish.
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                file_stats = future.result()
                with stats_lock:
                    stats['processed'] += file_stats['processed']
                    stats['skipped'] += file_stats['skipped']
                    stats['failed'] += file_stats['failed']
                    stats['invalid'] += file_stats['invalid']
                    stats['total'] += file_stats['processed'] + file_stats['skipped'] + file_stats['failed'] + file_stats['invalid']
            except Exception as e:
                print(f"处理文件 {file_path} 时出错:{e}")

    print()

    # Final summary.
    print(f"\n{'='*70}")
    print(f"分类完成!")
    print(f"{'='*70}")
    print(f"诗词总数:{stats['total']}")
    print(f"成功分类:{stats['processed']}")
    print(f"跳过重复:{stats['skipped']}")
    print(f"分类失败:{stats['failed']}")
    print(f"格式无效:{stats['invalid']}")
    print(f"输出文件:{args.output_file}")
    print(f"{'='*70}\n")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user