所有功能已完成,运行OK
This commit is contained in:
616
classify_pois.py
Normal file
616
classify_pois.py
Normal file
@@ -0,0 +1,616 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Poem Classification System with LLM API Integration
|
||||
|
||||
基于大模型的中国古代诗词多维度分类系统
|
||||
|
||||
分类维度包括:
|
||||
- 季节时序(四季、节气、时辰)
|
||||
- 题材类型(山水、边塞、咏史、咏物等)
|
||||
- 情感心境(喜怒哀乐、忧思愁绪等)
|
||||
- 景物意象(自然、植物、动物、建筑等)
|
||||
- 哲理思想(儒释道、人生感悟等)
|
||||
- 艺术手法(比兴、用典、对仗等)
|
||||
- 人物关系(送别、怀人、思乡等)
|
||||
- 生活场景(宴饮、耕作、读书等)
|
||||
|
||||
Configuration via environment variables:
|
||||
- LLM_BASE_URL: API base URL (e.g., http://localhost:11434/v1)
|
||||
- LLM_API_KEY: API key (optional for local models)
|
||||
- LLM_MODEL: Model name (e.g., qwen:7b, gpt-4)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import hashlib
|
||||
from typing import Dict, List, Set, Optional
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import threading
|
||||
|
||||
|
||||
def generate_poem_signature(poem_data: Dict) -> str:
    """
    Derive a stable MD5 signature for a poem from its title, author and text.

    Two poems with identical (stripped) title, author and flattened body
    produce the same digest; this is how duplicates are detected across runs.
    """
    title = poem_data.get('title', '').strip()
    author = poem_data.get('author', '').strip()

    lines: List[str] = []
    # 'paragraphs' may mix plain strings with one level of nested lists.
    if 'paragraphs' in poem_data:
        for entry in poem_data['paragraphs']:
            if isinstance(entry, str):
                lines.append(entry.strip())
            elif isinstance(entry, list):
                lines += [item.strip() for item in entry]
    # 'content' may be a single string or a flat list of strings.
    if 'content' in poem_data:
        body = poem_data['content']
        if isinstance(body, str):
            lines.append(body.strip())
        elif isinstance(body, list):
            lines += [item.strip() for item in body]

    digest_input = f"title:{title}|author:{author}|content:" + '\n'.join(lines)
    return hashlib.md5(digest_input.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
def load_existing_poems(output_file: str) -> Set[str]:
    """
    Load signatures of poems already present in the output JSONL file.

    Prefers the 'signature' field that classify_poem stores with every
    record; for legacy records without one, recomputes the signature from
    the stored title/author/paragraphs.

    Args:
        output_file: path to the JSONL results file (may not exist yet)

    Returns:
        Set of signature strings; empty when the file is missing/unreadable.
    """
    existing_signatures: Set[str] = set()

    if not os.path.exists(output_file):
        return existing_signatures

    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f" Warning: Skipping invalid JSON on line {line_num}")
                    continue
                if not isinstance(data, dict):
                    continue
                # BUG FIX: the previous version recomputed the signature from
                # the LLM 'analysis' text split on newlines, which never equals
                # a signature computed from the poem's real paragraphs — so
                # resume/skip silently never matched. Use the stored signature.
                sig = data.get('signature')
                if isinstance(sig, str) and sig:
                    existing_signatures.add(sig)
                elif 'title' in data and 'author' in data:
                    # Legacy record: rebuild from the stored (simplified)
                    # paragraphs instead of the analysis text.
                    existing_signatures.add(generate_poem_signature({
                        'title': data.get('title', ''),
                        'author': data.get('author', ''),
                        'paragraphs': data.get('paragraphs', [])
                    }))
    except IOError as e:
        print(f" Warning: Could not read existing output file: {e}")

    return existing_signatures
|
||||
|
||||
|
||||
def append_to_output_file(output_file: str, data: Dict, lock: threading.Lock):
    """
    Append one record to the output file as a single JSONL line.

    All serialization and writing happens while *lock* is held, so
    concurrent worker threads cannot interleave their output lines.
    """
    with lock, open(output_file, 'a', encoding='utf-8') as sink:
        sink.write(json.dumps(data, ensure_ascii=False) + '\n')
|
||||
|
||||
|
||||
class LLMClassifier:
    """
    LLM-backed classifier for classical Chinese poems.

    Calls a local or remote model through an OpenAI-compatible
    /chat/completions endpoint and labels each poem along 20+ dimensions
    (season, genre, emotion, imagery, philosophy, technique, ...).
    """

    # Runtime prompt sent to the model; placeholders {title}/{author}/{content}
    # are filled by str.format, hence the doubled braces around the JSON example.
    CLASSIFICATION_PROMPT = """你是一位中国古代诗词分类专家。请深入分析以下诗词,并从多个维度进行分类标注。

【诗词信息】
- 标题:{title}
- 作者:{author}
- 内容:{content}

【重要说明】
- 如果诗词内容是繁体中文,请在输出中包含简体中文版本
- 分类标签一律使用简体中文

【分类维度与标签选项】

1. 季节(season):["春", "夏", "秋", "冬", "四季", "无明确季节"]
2. 节气(solar_terms):24 节气中的具体节气,如 "立春"、"清明"、"冬至" 等,无则空数组
3. 时辰(time_of_day):["清晨", "上午", "正午", "下午", "黄昏", "夜晚", "深夜", "黎明", "不明确"]
4. 题材类型(genre):["山水田园", "边塞征战", "咏史怀古", "咏物言志", "送别怀人", "思乡怀远",
   "爱情闺怨", "友情赠答", "羁旅漂泊", "隐逸闲适", "讽喻时事", "节日习俗",
   "宴饮酬唱", "读书治学", "农耕劳作", "宗教禅理", "其他"]
5. 情感基调(emotion_tone):["喜悦欢快", "悲伤哀愁", "愤怒激愤", "忧郁伤感", "孤独寂寞",
   "宁静淡泊", "豪迈激昂", "思念眷恋", "惆怅失落", "平和超脱", "复杂混合"]
6. 具体情感(emotions):从以下选择 3-5 个最贴切的:
   ["喜", "怒", "哀", "乐", "忧", "思", "悲", "恐", "惊", "愁", "恨", "爱",
   "恋", "盼", "悔", "愧", "傲", "谦", "静", "躁"]
7. 景物 - 自然(nature_scenery):["山", "水", "云", "雨", "雪", "风", "雷", "电", "日", "月", "星", "霜", "露", "霞"]
8. 景物 - 植物(plants):["松", "竹", "梅", "兰", "菊", "荷", "柳", "桃", "李", "杏", "梨", "枫", "梧桐", "芭蕉", "其他"]
9. 景物 - 动物(animals):["鸟", "雁", "燕", "鹊", "蝉", "蛙", "鱼", "龙", "凤", "马", "牛", "羊", "犬", "其他"]
10. 景物 - 建筑(buildings):["楼", "阁", "亭", "台", "轩", "榭", "桥", "寺", "塔", "城", "关", "宫", "殿", "院", "其他"]
11. 意象关键词(imagery):提取 5-10 个诗中最具代表性的意象词汇(简体中文)
12. 哲理思想(philosophy):["儒家思想", "道家思想", "佛家禅理", "人生感悟", "历史兴叹", "自然之道", "无明显哲理"]
13. 人生阶段(life_stage):["少年", "青年", "中年", "老年", "不明确"]
14. 社会身份(social_role):["士人", "官员", "隐士", "游子", "征人", "商贾", "农夫", "僧道", "闺中", "其他"]
15. 写作手法(technique):["比兴", "赋", "对仗", "用典", "借景抒情", "托物言志", "虚实结合", "动静结合", "其他"]
16. 修辞手法(rhetoric):["比喻", "拟人", "夸张", "对偶", "排比", "反复", "设问", "反问", "其他"]
17. 色彩意象(colors):提取诗中的色彩词,如 ["青", "绿", "红", "白", "黄", "紫", "碧", "翠", "苍", "金"]
18. 声音意象(sounds):["钟声", "鼓声", "笛声", "琴声", "风声", "雨声", "鸟鸣", "蝉鸣", "其他"]
19. 地理方位(location):["江南", "塞北", "中原", "巴蜀", "关中", "岭南", "吴越", "荆楚", "其他"]
20. 节日习俗(festival):["春节", "元宵", "清明", "端午", "七夕", "中秋", "重阳", "除夕", "无"]

【返回格式】
请返回严格的 JSON 格式,结构如下:
{{
  "original_text": ["空山新雨後,天氣晚來秋。", ...], // 原始文本(如果输入是繁体)
  "simplified_text": ["空山新雨后,天气晚来秋。", ...], // 简体中文版本
  "season": ["秋"],
  "solar_terms": ["白露"],
  "time_of_day": "黄昏",
  "genre": ["山水田园", "隐逸闲适"],
  "emotion_tone": "宁静淡泊",
  "emotions": ["静", "喜", "乐"],
  "nature_scenery": ["山", "水", "月"],
  "plants": ["松", "竹"],
  "animals": ["鸟"],
  "buildings": [],
  "imagery": ["空山", "新雨", "明月", "青松"],
  "philosophy": ["道家思想", "自然之道"],
  "life_stage": "中年",
  "social_role": "隐士",
  "technique": ["借景抒情", "动静结合"],
  "rhetoric": ["拟人"],
  "colors": ["青", "白"],
  "sounds": [],
  "location": "终南山",
  "festival": "无",
  "analysis": "简要分析这首诗的主题思想、艺术特色和情感内涵(100-200 字,使用简体中文)"
}}

【注意事项】
- 只返回 JSON,不要有任何其他文字说明
- 每个维度根据诗意选择最贴切的标签,可以是 1 个或多个
- 如果某个维度没有明确对应,选择"无"、"不明确"或空数组
- 意象关键词应从原诗中提取或合理归纳(使用简体中文)
- 分析要准确、深入、简洁(使用简体中文)
- 如果输入是繁体中文,original_text 和 simplified_text 都要填写
- 如果输入已经是简体中文,original_text 和 simplified_text 填写相同内容"""

    def __init__(self, base_url: Optional[str] = None, api_key: Optional[str] = None,
                 model: Optional[str] = None, output_file: Optional[str] = None):
        """
        Initialize the classifier.

        Args:
            base_url: API base URL; falls back to $LLM_BASE_URL, then a
                built-in default.
            api_key: API key; optional for local models. Falls back to
                $LLM_API_KEY.
            model: model name; falls back to $LLM_MODEL, then 'qwen:7b'.
            output_file: if given, every successful classification is
                appended to this JSONL file immediately.
        """
        self.base_url = (base_url or os.getenv('LLM_BASE_URL') or 'https://api.siliconflow.cn/v1').rstrip('/')
        self.api_key = api_key or os.getenv('LLM_API_KEY') or ''
        self.model = model or os.getenv('LLM_MODEL') or 'qwen:7b'
        self.output_file = output_file
        # Lock serializing appends to output_file across worker threads;
        # None when no incremental output is requested.
        self.file_lock = threading.Lock() if output_file else None

    def _call_api(self, messages: List[Dict], temperature: float = 0.3, max_retries: int = 3) -> str:
        """
        Call the chat-completions API, retrying on network errors.

        Args:
            messages: chat messages (role/content dicts)
            temperature: sampling temperature
            max_retries: maximum number of attempts

        Returns:
            The assistant message content, or "" on any failure.
        """
        url = f"{self.base_url}/chat/completions"

        headers = {
            'Content-Type': 'application/json',
        }

        # Authorization header only when a key is configured (local models
        # typically need none).
        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'

        payload = {
            'model': self.model,
            'messages': messages,
            'temperature': temperature,
            'stream': False
        }

        for attempt in range(max_retries):
            try:
                data = json.dumps(payload).encode('utf-8')
                req = urllib.request.Request(url, data=data, headers=headers, method='POST')

                # Generous timeout: large prompts on local models can be slow.
                with urllib.request.urlopen(req, timeout=180) as response:
                    result = json.loads(response.read().decode('utf-8'))
                    return result['choices'][0]['message']['content']

            except urllib.error.URLError as e:
                # Network-level failure: retry until attempts run out.
                print(f" API request failed (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return ""
            except json.JSONDecodeError as e:
                # Malformed response body: retrying is unlikely to help.
                print(f" Failed to parse API response: {e}")
                return ""
            except Exception as e:
                print(f" Unexpected error (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return ""

        return ""

    def classify_poem(self, poem_data: Dict, skip_if_exists: bool = False,
                      existing_signatures: Optional[Set[str]] = None) -> Optional[Dict]:
        """
        Classify a single poem with the LLM.

        Args:
            poem_data: poem dict (title/author plus 'paragraphs' and/or
                'content')
            skip_if_exists: skip poems whose signature is already known
            existing_signatures: set of known poem signatures

        Returns:
            Classification result dict, or None when the poem is empty,
            already processed, or the API call / JSON parse fails.
        """
        title = poem_data.get('title', '')
        author = poem_data.get('author', '')

        # Flatten the poem body: 'paragraphs' may nest one level of lists;
        # 'content' may be a string or a flat list.
        content_parts = []
        if 'paragraphs' in poem_data:
            for para in poem_data['paragraphs']:
                if isinstance(para, str):
                    content_parts.append(para)
                elif isinstance(para, list):
                    content_parts.extend(para)
        if 'content' in poem_data:
            if isinstance(poem_data['content'], str):
                content_parts.append(poem_data['content'])
            elif isinstance(poem_data['content'], list):
                content_parts.extend(poem_data['content'])

        content = '\n'.join(content_parts)

        if not title and not content:
            return None

        # Skip poems that were already processed in a previous run.
        if skip_if_exists and existing_signatures is not None:
            poem_sig = generate_poem_signature(poem_data)
            if poem_sig in existing_signatures:
                return None

        prompt = self.CLASSIFICATION_PROMPT.format(
            title=title or '无题',
            author=author or '佚名',
            content=content
        )

        messages = [
            {'role': 'system', 'content': '你是一位中国古代诗词分类专家,精通诗词鉴赏和分类,能够准确识别诗词的题材、情感、意象和艺术特色。'},
            {'role': 'user', 'content': prompt}
        ]

        response = self._call_api(messages, temperature=0.3)

        if not response:
            return None

        try:
            # Salvage the JSON object even if the model wrapped it in prose:
            # grab the outermost {...} span first, fall back to the raw text.
            json_match = re.search(r'\{[\s\S]*\}', response)
            if json_match:
                result = json.loads(json_match.group())
            else:
                result = json.loads(response)

            # Handle traditional->simplified conversion results.
            paragraphs = poem_data.get('paragraphs', [])
            simplified_paragraphs = []

            # Prefer the simplified version when the LLM returned one.
            if 'simplified_text' in result:
                simplified_paragraphs = result['simplified_text']
            elif 'original_text' in result:
                # Only original_text present: input was already simplified.
                simplified_paragraphs = paragraphs
            else:
                # No conversion info at all: keep the input text.
                simplified_paragraphs = paragraphs

            classification_result = {
                'id': poem_data.get('id', 'unknown'),
                'title': title,
                'author': author,
                'paragraphs': simplified_paragraphs,  # simplified version
                'original_paragraphs': paragraphs if paragraphs != simplified_paragraphs else None,  # original kept only when it differs
                'llm_classification': result,
                'signature': generate_poem_signature({
                    'title': title,
                    'author': author,
                    'paragraphs': simplified_paragraphs
                }),  # signature is based on the simplified content
                'timestamp': datetime.now().isoformat(),
                'read_mark': False
            }

            # Persist immediately so progress survives interruption.
            if self.output_file and self.file_lock:
                append_to_output_file(self.output_file, classification_result, self.file_lock)

            return classification_result

        except json.JSONDecodeError as e:
            print(f" Failed to parse LLM response as JSON: {e}")
            print(f" Raw response: {response[:300]}...")
            return None
|
||||
|
||||
|
||||
def is_valid_poem_data(poem_data: Dict) -> bool:
    """
    Check whether a poem record has the expected shape.

    A valid record is a dict with a string 'id', non-blank string 'title'
    and 'author', and 'paragraphs' given as a list of strings.
    """
    if not isinstance(poem_data, dict):
        return False

    # All three core fields must be present.
    if not {'author', 'paragraphs', 'title'} <= poem_data.keys():
        return False

    paragraphs = poem_data['paragraphs']
    if not isinstance(paragraphs, list):
        return False
    if not all(isinstance(line, str) for line in paragraphs):
        return False

    # 'id' must exist and be a string (a missing key yields None here).
    if not isinstance(poem_data.get('id'), str):
        return False

    # Author and title must be non-blank strings.
    for field in ('author', 'title'):
        value = poem_data[field]
        if not isinstance(value, str) or not value.strip():
            return False

    return True
|
||||
|
||||
|
||||
def load_poems_from_file(file_path: str) -> List[Dict]:
    """
    Load poem records from a file.

    Accepts three layouts: a JSON array, a single JSON object, or JSONL
    (one JSON object per line). Returns an empty list for empty or
    unreadable files.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = fh.read().strip()
    except IOError as err:
        print(f"Error reading file {file_path}: {err}")
        return []

    if not text:
        return []

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to JSONL: keep only lines that parse to dicts.
        records = []
        for raw_line in text.splitlines():
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                candidate = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                records.append(candidate)
        return records

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]
    # Any other top-level JSON value (number, string, ...) is not poem data.
    return []
|
||||
|
||||
|
||||
def main():
    """CLI entry point: scan input files, classify every poem, print stats."""
    import argparse

    parser = argparse.ArgumentParser(
        description='基于大模型的中国古代诗词多维度分类系统',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 使用本地模型分类(递归搜索子目录)
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url http://localhost:11434/v1 --model qwen:7b

  # 使用远程模型分类
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url https://api.openai.com/v1 --model gpt-4 --api-key sk-xxx

  # 断点续跑(自动跳过已处理的诗词)
  python classify_pois.py ./poems ./output.jsonl 10 --llm \\
      --base-url http://localhost:11434/v1 --model qwen:7b

  # 只扫描当前目录,不递归子目录
  python classify_pois.py ./poems ./output.jsonl 10 --llm --no-recursive \\
      --base-url http://localhost:11434/v1 --model qwen:7b

环境变量(备选):
  LLM_BASE_URL, LLM_API_KEY, LLM_MODEL
"""
    )

    parser.add_argument('input_folder', help='输入文件夹路径(包含诗词 JSON/JSONL 文件)')
    parser.add_argument('output_file', help='输出 JSONL 文件路径')
    parser.add_argument('max_workers', nargs='?', type=int, default=10, help='并发数(默认:10)')

    parser.add_argument('--llm', action='store_true', help='使用 LLM 分类(默认)')
    parser.add_argument('--base-url', type=str, help='LLM API 基础 URL')
    parser.add_argument('--api-key', type=str, help='LLM API 密钥')
    parser.add_argument('--model', type=str, help='LLM 模型名称')
    parser.add_argument('--no-recursive', action='store_true', help='不递归搜索子目录')

    args = parser.parse_args()

    # Resolve configuration: CLI flags win over environment variables.
    base_url = args.base_url or os.getenv('LLM_BASE_URL', 'http://localhost:11434/v1')
    api_key = args.api_key or os.getenv('LLM_API_KEY', '')
    model = args.model or os.getenv('LLM_MODEL', 'qwen:7b')

    print(f"\n{'='*70}")
    print(f"基于大模型的诗词多维度分类系统")
    print(f"{'='*70}")
    print(f"输入目录:{args.input_folder}")
    print(f"输出文件:{args.output_file}")
    print(f"API 地址:{base_url}")
    print(f"模型名称:{model}")
    print(f"{'='*70}\n")

    # Resume support: collect signatures of poems already in the output file.
    existing_signatures = load_existing_poems(args.output_file)
    if existing_signatures:
        print(f"检测到输出文件中已有 {len(existing_signatures)} 首诗词,将自动跳过重复项\n")

    # Classifier appends each result to output_file as soon as it is ready.
    llm_classifier = LLMClassifier(
        base_url=base_url,
        api_key=api_key,
        model=model,
        output_file=args.output_file
    )

    # Collect input files (optionally recursing into subdirectories).
    valid_extensions = {'.json', '.jsonl'}
    json_files = []

    if args.no_recursive:
        # Non-recursive: scan only the top-level directory.
        for f in os.listdir(args.input_folder):
            if not any(f.lower().endswith(ext) for ext in valid_extensions):
                continue
            file_path = os.path.join(args.input_folder, f)
            if not os.path.isfile(file_path):
                continue
            json_files.append(file_path)
    else:
        # Recursive walk over all subdirectories.
        for root, dirs, files in os.walk(args.input_folder):
            # Prune hidden directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if not d.startswith('.')]

            for f in files:
                if not any(f.lower().endswith(ext) for ext in valid_extensions):
                    continue
                # Skip hidden files.
                if f.startswith('.'):
                    continue
                file_path = os.path.join(root, f)
                if os.path.isfile(file_path):
                    json_files.append(file_path)

    # Sort by path so processing order is deterministic between runs.
    json_files.sort()

    print(f"发现 {len(json_files)} 个有效的 JSON/JSONL 文件\n")

    # Show the directory structure (if there are subdirectories).
    # NOTE(review): files directly in input_folder contribute an empty string
    # here, so this prints even without real subdirectories — confirm intent.
    subdirs = set(os.path.dirname(f).replace(args.input_folder, '').lstrip('\\').lstrip('/') for f in json_files)
    if subdirs:
        print(f"包含子目录:{', '.join(sorted(subdirs))}")

    # Aggregate counters across all files; guarded by stats_lock because
    # they are updated from the as_completed loop.
    stats = {
        'processed': 0,
        'skipped': 0,
        'failed': 0,
        'invalid': 0,
        'total': 0
    }
    stats_lock = threading.Lock()

    def process_poem_batch(file_path: str) -> dict:
        """Classify every poem in one input file; returns per-file counters."""
        file_stats = {'processed': 0, 'skipped': 0, 'failed': 0, 'invalid': 0}
        poems = load_poems_from_file(file_path)

        for idx, poem in enumerate(poems):
            if not is_valid_poem_data(poem):
                file_stats['invalid'] += 1
                print(f" [{idx+1}/{len(poems)}] 格式无效:{poem.get('title', 'Unknown')}")
                continue

            # Dedup against results of previous runs and other workers.
            poem_sig = generate_poem_signature(poem)
            if poem_sig in existing_signatures:
                file_stats['skipped'] += 1
                print(f" [{idx+1}/{len(poems)}] 跳过(已存在): {poem.get('title', 'Unknown')}")
                continue

            print(f" [{idx+1}/{len(poems)}] 分类中:{poem.get('title', 'Unknown')}")
            result = llm_classifier.classify_poem(poem, skip_if_exists=True, existing_signatures=existing_signatures)

            if result:
                file_stats['processed'] += 1
                # Record the signature so later files skip this poem.
                existing_signatures.add(poem_sig)
                print(f" ✓ 成功")
            else:
                file_stats['failed'] += 1
                print(f" ✗ 失败")

        return file_stats

    # Fan the files out to a thread pool (one task per input file).
    print(f"使用 {args.max_workers} 个并发线程处理\n")

    with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
        # Submit one processing task per file.
        future_to_file = {executor.submit(process_poem_batch, fp): fp for fp in json_files}

        # Merge per-file counters as tasks finish.
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                file_stats = future.result()
                with stats_lock:
                    stats['processed'] += file_stats['processed']
                    stats['skipped'] += file_stats['skipped']
                    stats['failed'] += file_stats['failed']
                    stats['invalid'] += file_stats['invalid']
                    stats['total'] += file_stats['processed'] + file_stats['skipped'] + file_stats['failed'] + file_stats['invalid']
            except Exception as e:
                print(f"处理文件 {file_path} 时出错:{e}")

    print()

    # Final summary.
    print(f"\n{'='*70}")
    print(f"分类完成!")
    print(f"{'='*70}")
    print(f"诗词总数:{stats['total']}")
    print(f"成功分类:{stats['processed']}")
    print(f"跳过重复:{stats['skipped']}")
    print(f"分类失败:{stats['failed']}")
    print(f"格式无效:{stats['invalid']}")
    print(f"输出文件:{args.output_file}")
    print(f"{'='*70}\n")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user