#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Content analysis module.

Provides automatic keyword extraction, text summarization and tag generation.
"""
import hashlib
import re
from collections import Counter
from typing import Dict, List, Tuple

import jieba
import jieba.analyse

from cache_manager import cache_manager
from config import settings
from exceptions import GenerationError
from model_manager import model_router
class ContentAnalyzer:
    """Analyzes text content: keywords, summaries and tags.

    Keyword extraction and tagging are done locally with jieba's TF-IDF
    extractor; summarization prefers the configured model provider (via
    ``model_router``) and falls back to a simple extractive method when no
    provider is configured or the model call fails.
    """

    def __init__(self):
        # Pre-load jieba's dictionary so the first analysis call is fast.
        jieba.initialize()

        # Kept for backward compatibility: all generation goes through
        # model_router, so no dedicated API client is held here.
        self.client = None

    def extract_keywords(self, text: str, top_k: int = 3) -> List[str]:
        """Extract keywords from *text* using jieba's TF-IDF extractor.

        Args:
            text: Input text.
            top_k: Maximum number of keywords to return.

        Returns:
            List of keyword strings (may be shorter than ``top_k`` for
            short or empty input).
        """
        return jieba.analyse.extract_tags(text, topK=top_k, withWeight=False)

    def generate_summary(self, text: str, sentences: int = 3) -> str:
        """Generate a summary of *text*, at most *sentences* sentences long.

        Results are cached (keyed on an MD5 of text + sentence count).
        Uses the model router when a provider is configured; otherwise — or
        on any model error — falls back to :meth:`_simple_summary`.

        Args:
            text: Input text.
            sentences: Desired number of summary sentences.

        Returns:
            The summary string (empty string if the model returned nothing).
        """
        cache_key = f"summary:{hashlib.md5(f'{text}:{sentences}'.encode()).hexdigest()}"

        cached_result = cache_manager.get(cache_key)
        if cached_result is not None:
            return cached_result

        if not model_router.providers:
            # No model provider configured: use the extractive fallback.
            result = self._simple_summary(text, sentences)
        else:
            try:
                prompt = f"""请为以下文本生成一个{sentences}句话的摘要:
{text}

要求:
1. 保留最重要的信息
2. 语言简洁明了
3. 不超过{sentences}句话
"""
                summary = model_router.generate_text(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "你是一个专业的文本摘要生成器。"},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=200,
                    temperature=0.3
                )
                result = summary.strip() if summary else ""
            except Exception as e:
                # Deliberate best-effort: any model failure degrades to the
                # extractive summary rather than propagating.  (The previous
                # message wrongly blamed the "OpenAI API" — calls go through
                # the model router, which may use any provider.)
                print(f"Model summary generation failed, using extractive fallback: {e}")
                result = self._simple_summary(text, sentences)

        cache_manager.put(cache_key, result)
        return result

    def _simple_summary(self, text: str, sentences: int = 3) -> str:
        """Extractive fallback summary: the first *sentences* sentences.

        Sentences are split on Chinese/ASCII terminal punctuation; the
        original text is returned unchanged when it already contains no
        more than *sentences* sentences.

        Args:
            text: Input text.
            sentences: Number of leading sentences to keep.

        Returns:
            The truncated summary, re-joined with '。'.
        """
        sentence_endings = r'[。!?.!?]'
        sentences_list = [s.strip() for s in re.split(sentence_endings, text) if s.strip()]

        if len(sentences_list) <= sentences:
            return text

        return '。'.join(sentences_list[:sentences]) + '。'

    def generate_tags(self, text: str, top_k: int = 3) -> List[str]:
        """Generate tags for *text*.

        Currently tags are the extracted keywords; richer strategies
        (POS tagging, entity recognition) could be layered on later.

        Args:
            text: Input text.
            top_k: Maximum number of tags to return.

        Returns:
            List of tag strings.
        """
        return self.extract_keywords(text, top_k)

    def analyze_content(self, text: str) -> Dict:
        """Run the full analysis pipeline on *text*.

        Args:
            text: Input text.

        Returns:
            Dict with keys ``keywords``, ``summary`` and ``tags``.
        """
        return {
            "keywords": self.extract_keywords(text, 3),
            "summary": self.generate_summary(text, 3),
            "tags": self.generate_tags(text, 3),
        }
# Module-level shared analyzer instance used across the application.
analyzer = ContentAnalyzer()