Files
ArticleReplaceBatch/utils.py
2026-03-25 15:17:18 +08:00

201 lines
6.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import re
import pandas as pd
import requests
from config import *
def text_detection(text):
    """
    Submit text to Baidu's user-defined text-censor endpoint and return its verdict.

    :param text: the text to check
    :return: the value of the ``conclusion`` field in the JSON response, or
        '合规' when the response carries no such field
    """
    url = "https://aip.baidubce.com/rest/2.0/solution/v1/text_censor/v2/user_defined"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # Pass the form field as a dict so requests URL-encodes it. Concatenating
    # 'text=' + text by hand breaks as soon as the text contains '&', '=',
    # '%' or '+', because those characters are form-encoding syntax.
    response = requests.post(
        url,
        params={"access_token": get_baidu_access_token()},
        data={"text": text},
        headers=headers,
        # Without a timeout a stalled connection would block the caller forever.
        timeout=30,
    )
    data = json.loads(response.text)
    # Echo the raw service response to stdout (kept from the original behaviour).
    print(data)
    # Read 'conclusion' defensively; fall back to the default when it is absent.
    conclusion = data.get('conclusion', '合规')
    return conclusion
def get_baidu_access_token():
    """
    Request an access token from Baidu's OAuth endpoint using the API key pair.

    The key and secret are read from ``CONFIG['Baidu']`` on every call.

    :return: the ``access_token`` field of the JSON response converted with
        ``str()``; note that a missing field therefore yields the string
        ``"None"`` rather than the ``None`` object
    """
    token_endpoint = "https://aip.baidubce.com/oauth/2.0/token"
    query = {
        "grant_type": "client_credentials",
        "client_id": CONFIG['Baidu']['api_key'],
        "client_secret": CONFIG['Baidu']['secret_key'],
    }
    reply = requests.post(token_endpoint, params=query)
    token = reply.json().get("access_token")
    return str(token)
def safe_filename(filename):
    """
    Sanitise a file name by replacing characters that are unsafe in file names.

    Every character Windows forbids in a file name -- the punctuation
    ``< > : " / \\ | ? *`` and the ASCII control characters 0-31 (newline,
    tab, NUL, ...) -- is replaced with an underscore. Leading and trailing
    dots and spaces are then removed.

    :param filename: the raw file name
    :return: the sanitised name, or 'untitled' when nothing is left
    """
    # Reserved punctuation plus control characters; the latter commonly sneak
    # in through scraped titles and are just as invalid on Windows.
    unsafe_chars = '<>:"/\\|?*' + ''.join(map(chr, range(32)))
    # One translate pass replaces every unsafe character at once.
    filename = filename.translate(str.maketrans(dict.fromkeys(unsafe_chars, '_')))
    # Strip leading/trailing dots and spaces.
    filename = filename.strip('. ')
    # Fall back to a default name when the result is empty.
    return filename or 'untitled'
def safe_open_directory(directory_path, base_path=None):
    """
    Create a directory (and any missing parents) after validating its path.

    The path is normalised first. When ``base_path`` is given, the directory
    must resolve to ``base_path`` itself or something beneath it. The
    normalised path is then rejected if it contains any of a fixed list of
    substrings ('..', '~', shell metacharacters, ...). If the directory does
    not exist yet it is created and its mode set to 0o755.

    Any exception raised along the way is logged via ``logging.error`` and
    re-raised.

    :param directory_path: directory to create
    :param base_path: optional root the directory must stay inside
    :return: None
    :raises ValueError: when the path escapes ``base_path`` or contains a
        rejected substring
    """
    try:
        directory_path = os.path.normpath(directory_path)

        # Containment check: only performed when a base path was supplied.
        if base_path:
            root = os.path.abspath(base_path)
            candidate = os.path.abspath(directory_path)
            shared_prefix = os.path.commonpath([root, candidate])
            stays_inside = shared_prefix == root or candidate.startswith(root + os.sep)
            if not stays_inside:
                raise ValueError(f"目录路径不安全: {directory_path}")

        # Reject the first listed substring found in the normalised path.
        for token in ('..', '~', '$', '`', ';', '&', '|', '<', '>', '(', ')'):
            if token in directory_path:
                raise ValueError(f"目录路径包含不安全字符: {token}")

        # Nothing to do when the path already exists.
        if os.path.exists(directory_path):
            return

        os.makedirs(directory_path, exist_ok=True)
        # 0o755 rather than a world-writable mode.
        os.chmod(directory_path, 0o755)
    except Exception as e:
        # Log the failure, then let the caller see the original exception.
        logging.error(f"创建目录失败: {e}")
        raise
def check_keywords_in_text(text):
    """
    Report whether the text contains any configured banned word.

    The banned words are read from ``CONFIG['Keywords']['banned_words']`` as a
    single comma-separated string; blank entries are ignored.

    :param text: the text to scan
    :return: True if at least one banned word occurs in ``text``, else False
    """
    raw_list = CONFIG['Keywords']['banned_words']
    # An empty or whitespace-only setting means there is nothing to look for.
    if not raw_list or not raw_list.strip():
        return False
    candidates = (piece.strip() for piece in raw_list.split(','))
    return any(word in text for word in candidates if word)
def extract_content_until_punctuation(text, punctuations=r'[,。!?;]'):
    """
    Return the start of ``text`` up to and including the first punctuation match.

    :param text: the input text
    :param punctuations: regular-expression pattern identifying punctuation
    :return: the prefix of ``text`` ending at the end of the first match,
        stripped of surrounding whitespace; the whole stripped text when the
        pattern does not match
    """
    hit = re.search(punctuations, text)
    # Cut right after the matched punctuation, or keep everything if none was found.
    cutoff = hit.end() if hit else len(text)
    return text[:cutoff].strip()
# Read the link column and the type column of an Excel sheet and return them
# as a list of (link, type) tuples.
def read_excel(file_name):
    """
    Load an Excel file and pair each first-column value with its type.

    The first column is taken as the link column. The type comes from the
    '领域' column when the sheet has one; otherwise every link gets the type
    '默认'. The resulting list is also printed to stdout.

    :param file_name: path of the Excel file, passed to ``pandas.read_excel``
    :return: list of ``(link, type)`` tuples
    """
    sheet = pd.read_excel(file_name)
    links = sheet[sheet.columns[0]].tolist()
    if '领域' in sheet.columns:
        types = sheet['领域'].tolist()
    else:
        # No type column: use the default type for every link.
        types = ['默认'] * len(links)
    result = list(zip(links, types))
    print(result)
    return result
from typing import Tuple
def handle_duplicate_files_advanced(folder_path: str, filename: str) -> Tuple[str, bool]:
    """
    Pick a file name that does not collide with existing files in a folder.

    The name is first passed through ``safe_filename``. If no file of that
    name exists in ``folder_path`` it is returned unchanged. Otherwise the
    folder is listed once, every entry of the form ``<base><ext>`` or
    ``<base>_<number><ext>`` is considered, and ``<base>_<max + 1><ext>`` is
    returned.

    :param folder_path: folder to check
    :param filename: the original file name
    :return: ``(name, renamed)`` where ``renamed`` is True when a numeric
        suffix was added
    """
    filename = safe_filename(filename)
    if not os.path.exists(os.path.join(folder_path, filename)):
        return filename, False

    stem, suffix = os.path.splitext(filename)
    taken = set(os.listdir(folder_path))
    numbered = re.compile(r'^{}(_(\d+))?{}$'.format(re.escape(stem), re.escape(suffix)))

    # Collect the numeric suffixes already in use; the bare name counts as 0.
    used = [int(hit.group(2) or 0) for hit in map(numbered.match, taken) if hit]
    counter = max(used, default=0) + 1

    # Safety net: skip any candidate already present in the directory snapshot.
    while f"{stem}_{counter}{suffix}" in taken:
        counter += 1
    return f"{stem}_{counter}{suffix}", True