165 lines
4.7 KiB
Python
165 lines
4.7 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
内容分析模块
|
|||
|
|
提供自动关键词提取、文本摘要和标签生成功能
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import hashlib
import logging
import re
from collections import Counter
from typing import List, Tuple, Dict

import jieba
import jieba.analyse

from cache_manager import cache_manager
from config import settings
from exceptions import GenerationError
from model_manager import model_router
|
|||
|
|
|
|||
|
|
class ContentAnalyzer:
    """Keyword extraction, summarisation and tag generation for text.

    Keywords and tags come from jieba's TF-IDF extractor. Summaries are
    requested from ``model_router`` when it has providers configured, and
    otherwise (or on failure) produced by a simple lead-sentence extractor.
    """

    # Model name handed to the router when requesting a summary.
    _SUMMARY_MODEL = "gpt-3.5-turbo"

    # A run of one or more sentence terminators: the CJK full stop, ASCII and
    # full-width '!' / '?', and the ASCII '.' -- except a '.' sitting between
    # two digits, so decimals such as "3.5" are not split in half.
    _TERMINATOR_RUN = re.compile(r"(?:[。!?!?]|\.(?!\d)|(?<!\d)\.)+")

    def __init__(self):
        """Initialise jieba and set up the analyzer."""
        # Initialise jieba explicitly instead of on first use.
        jieba.initialize()

        # Model access goes through model_router, so no dedicated client is
        # created; the attribute is kept so existing readers of it still work.
        self.client = None

    def extract_keywords(self, text: str, top_k: int = 3) -> List[str]:
        """Extract keywords from *text* with jieba's TF-IDF extractor.

        Args:
            text: Input text.
            top_k: Number of keywords requested from jieba.

        Returns:
            List of keyword strings; empty when *text* is empty or blank.
        """
        # Nothing to analyse -- skip the call (also tolerates ``None``).
        if not text or not text.strip():
            return []
        return jieba.analyse.extract_tags(text, topK=top_k, withWeight=False)

    def generate_summary(self, text: str, sentences: int = 3) -> str:
        """Generate a summary of *text*.

        A cached summary is returned when one exists. Otherwise the model
        router is asked for a summary; if no provider is configured, the call
        fails, or the model returns nothing, the extractive
        ``_simple_summary`` result is returned instead.

        Only model-generated summaries are written to the cache: the fallback
        is cheap to recompute, and caching it would keep serving the degraded
        summary after a transient model failure has cleared.

        Args:
            text: Input text.
            sentences: Desired number of sentences in the summary.

        Returns:
            The summary text; ``""`` when *text* is empty or blank.
        """
        if not text or not text.strip():
            return ""

        # Key format is unchanged so existing cache entries stay valid.
        digest = hashlib.md5(f"{text}:{sentences}".encode()).hexdigest()
        cache_key = f"summary:{digest}"

        cached = cache_manager.get(cache_key)
        if cached is not None:
            return cached

        # No provider configured: use plain sentence extraction.
        if not model_router.providers:
            return self._simple_summary(text, sentences)

        prompt = (
            f"请为以下文本生成一个{sentences}句话的摘要:\n"
            f"{text}\n"
            "\n"
            "要求:\n"
            "1. 保留最重要的信息\n"
            "2. 语言简洁明了\n"
            f"3. 不超过{sentences}句话\n"
        )

        try:
            reply = model_router.generate_text(
                model=self._SUMMARY_MODEL,
                messages=[
                    {"role": "system", "content": "你是一个专业的文本摘要生成器。"},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=200,
                temperature=0.3,
            )
            summary = reply.strip() if reply else ""
        except Exception as exc:
            # Deliberate best-effort boundary: any model failure degrades to
            # the extractive summary instead of propagating to the caller.
            logging.getLogger(__name__).warning("OpenAI API调用失败: %s", exc)
            return self._simple_summary(text, sentences)

        if not summary:
            # An empty reply is not a usable summary; do not cache it.
            return self._simple_summary(text, sentences)

        cache_manager.put(cache_key, summary)
        return summary

    def _sentence_spans(self, text: str) -> List[tuple]:
        """Return ``(start, end)`` offsets of each non-empty sentence in *text*.

        ``start`` is the first non-whitespace character of the sentence and
        ``end`` is just past its terminator run (or the end of the text for an
        unterminated trailing sentence).
        """
        spans = []
        cursor = 0
        for match in self._TERMINATOR_RUN.finditer(text):
            chunk = text[cursor:match.start()]
            if chunk.strip():
                leading = len(chunk) - len(chunk.lstrip())
                spans.append((cursor + leading, match.end()))
            cursor = match.end()

        tail = text[cursor:]
        if tail.strip():
            leading = len(tail) - len(tail.lstrip())
            spans.append((cursor + leading, cursor + len(tail.rstrip())))
        return spans

    def _simple_summary(self, text: str, sentences: int = 3) -> str:
        """Summarise *text* by keeping its first *sentences* sentences.

        The result is a slice of the original text, so the original sentence
        terminators and the spacing between sentences are preserved.

        Args:
            text: Input text.
            sentences: Number of leading sentences to keep.

        Returns:
            The whole *text* when it has at most *sentences* sentences,
            ``""`` when *sentences* is not positive, otherwise the leading
            sentences.
        """
        if sentences <= 0:
            return ""

        spans = self._sentence_spans(text)
        if len(spans) <= sentences:
            return text

        # Only the last sentence can lack a terminator, and it is never
        # selected here, so the slice always ends on a terminator.
        first_start = spans[0][0]
        last_end = spans[sentences - 1][1]
        return text[first_start:last_end]

    def generate_tags(self, text: str, top_k: int = 3) -> List[str]:
        """Generate tags for *text*.

        Tags are currently the extracted keywords; richer strategies (for
        example part-of-speech tagging or entity recognition) can be added
        here later.

        Args:
            text: Input text.
            top_k: Number of tags to return.

        Returns:
            List of tag strings.
        """
        return self.extract_keywords(text, top_k)

    def analyze_content(self, text: str) -> Dict:
        """Run keyword extraction, summarisation and tagging on *text*.

        Args:
            text: Input text.

        Returns:
            Dict with the keys ``"keywords"``, ``"summary"`` and ``"tags"``.
        """
        return {
            "keywords": self.extract_keywords(text, 3),
            "summary": self.generate_summary(text, 3),
            "tags": self.generate_tags(text, 3),
        }
|
|||
|
|
|
|||
|
|
# 创建全局实例
|
|||
|
|
# Shared module-level instance. Constructing it here runs
# ContentAnalyzer.__init__ (and therefore jieba.initialize()) at import time.
analyzer = ContentAnalyzer()
|