#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Content analysis module.

Provides automatic keyword extraction, text summarization, and tag generation.
"""
import hashlib
import re
from typing import Dict, List

import jieba
import jieba.analyse

from cache_manager import cache_manager
from model_manager import model_router


class ContentAnalyzer:
    def __init__(self):
        # Load jieba's dictionary eagerly so the first analysis call is not slow
        jieba.initialize()
        # All requests go through the model router, so no dedicated client is needed
        self.client = None

    def extract_keywords(self, text: str, top_k: int = 3) -> List[str]:
        """
        Extract keywords from a text.

        Args:
            text: input text
            top_k: number of keywords to return

        Returns:
            List of keywords.
        """
        # Use jieba's TF-IDF based keyword extractor
        keywords = jieba.analyse.extract_tags(text, topK=top_k, withWeight=False)
        return keywords

    def generate_summary(self, text: str, sentences: int = 3) -> str:
        """
        Generate a summary of a text.

        Args:
            text: input text
            sentences: number of sentences in the summary

        Returns:
            The text summary.
        """
        # Build a cache key from the text and the requested summary length
        cache_key = f"summary:{hashlib.md5(f'{text}:{sentences}'.encode()).hexdigest()}"

        # Return a cached result if one exists
        cached_result = cache_manager.get(cache_key)
        if cached_result is not None:
            return cached_result

        # Without a configured model provider, fall back to simple sentence extraction
        if not model_router.providers:
            result = self._simple_summary(text, sentences)
            cache_manager.put(cache_key, result)
            return result

        try:
            # Generate the summary through the model router
            prompt = f"""Please write a {sentences}-sentence summary of the following text:

{text}

Requirements:
1. Keep the most important information.
2. Use clear, concise language.
3. Use no more than {sentences} sentences.
"""
            summary = model_router.generate_text(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a professional text summarizer."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=200,
                temperature=0.3
            )
            result = summary.strip() if summary else ""
            cache_manager.put(cache_key, result)
            return result
        except Exception as e:
            # If the model call fails, fall back to the simple method
            print(f"Model call failed: {e}")
            result = self._simple_summary(text, sentences)
            cache_manager.put(cache_key, result)
            return result

    def _simple_summary(self, text: str, sentences: int = 3) -> str:
        """
        Simple extractive summary (position-based: keeps the leading sentences).

        Args:
            text: input text
            sentences: number of sentences in the summary

        Returns:
            The text summary.
        """
        # Naive sentence splitting on common Chinese and Latin terminators
        sentence_endings = r'[。!?.!?]'
        sentences_list = re.split(sentence_endings, text)
        sentences_list = [s.strip() for s in sentences_list if s.strip()]
        if len(sentences_list) <= sentences:
            return text
        # Keep the leading sentences as the summary; splitting stripped the
        # terminators, so rejoin with a full stop
        summary_sentences = sentences_list[:sentences]
        return '。'.join(summary_sentences) + '。'

    def generate_tags(self, text: str, top_k: int = 3) -> List[str]:
        """
        Generate tags for a text.

        Args:
            text: input text
            top_k: number of tags to return

        Returns:
            List of tags.
        """
        # Use the extracted keywords as tags
        keywords = self.extract_keywords(text, top_k)
        # More tag-generation logic (POS tagging, entity recognition, etc.)
        # could be added here
        return keywords

    def analyze_content(self, text: str) -> Dict:
        """
        Run a combined analysis of a text.

        Args:
            text: input text

        Returns:
            Dict with keywords, summary, and tags.
        """
        keywords = self.extract_keywords(text, 3)
        summary = self.generate_summary(text, 3)
        tags = self.generate_tags(text, 3)
        return {
            "keywords": keywords,
            "summary": summary,
            "tags": tags
        }


# Module-level singleton instance
analyzer = ContentAnalyzer()
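

# A minimal usage sketch (illustrative, not part of the original module):
# it exercises analyze_content end to end. With no model provider configured,
# generate_summary falls back to the sentence-extraction heuristic, so this
# runs without API credentials; the sample text is an arbitrary example.
if __name__ == "__main__":
    sample_text = (
        "机器学习是人工智能的一个分支。"
        "它研究计算机如何从数据中自动学习规律。"
        "近年来,深度学习推动了这一领域的快速发展。"
    )
    result = analyzer.analyze_content(sample_text)
    print("keywords:", result["keywords"])
    print("summary:", result["summary"])
    print("tags:", result["tags"])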