Files
ArticleReplaceBatch/main_process.py
2026-03-25 15:17:18 +08:00

350 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import threading
import queue
import json
from ai_studio import call_coze_article_workflow, call_coze_all_article_workflow
from images_edit import download_and_process_images
from utils import *
from get_web_content import *
from config import *
from plagiarismdetecto import FinalPlagiarismDetector
# ==============================主程序===========================
def process_link(link_info, ai_service, current_template=None, generation_type=None, app=None):
    """Extract one source article, rewrite it via the configured AI workflow,
    and save the result (plus images) under the per-type output directories.

    Args:
        link_info: Either a ``(link, article_type)`` tuple/list or a bare link
            string. When no type is provided, falls back to ``generation_type``
            or the literal "未分类" (uncategorized).
        ai_service: Which AI backend to use; only ``"coze"`` is handled here.
        current_template: Optional dict of per-template Coze credentials
            (``workflow_id`` / ``access_token`` / ``is_async``) that temporarily
            override the global CONFIG['Coze'] section.
        generation_type: "短篇" (short piece) or "文章" (full article); selects
            which Coze workflow is called and how the title is derived.
        app: Optional GUI object; when present, ``update_article_info_list`` is
            called with the article's stats.

    Returns:
        None. Returns early (without raising) when the article fails basic
        validation (empty/over-long title, too-short body).

    Raises:
        Re-raises any exception from extraction, the Coze workflow, or saving,
        after logging it.
    """
    # Normalize link_info: accept (link, type) pairs or a bare link value.
    if isinstance(link_info, (tuple, list)) and len(link_info) >= 2:
        link, article_type = link_info
    else:
        link = str(link_info) if not isinstance(link_info, str) else link_info
        article_type = generation_type or "未分类"
    # Initialize state up front so the finally-restore and later logging are
    # safe even if an early branch fails.
    original_config = None
    message_content = ""
    title = ""
    try:
        # Extraction with retries and fallbacks; all three supported hosts use
        # the same helper, dispatched purely on URL prefix.
        # NOTE(review): the weixin prefix is "...qq.co" (no trailing "m") —
        # still matches "mp.weixin.qq.com" via startswith, but looks truncated.
        if isinstance(link, str) and link.startswith("https://www.toutiao.com"):
            logger.info(f"正在提取头条文章内容: {link}")
            title_text, article_text, img_urls = extract_content_with_retry(link, max_retries=3)
            logger.info(f"提取完成,标题长度: {len(title_text)}, 内容长度: {len(article_text)}, 图片数: {len(img_urls)}")
        elif isinstance(link, str) and link.startswith("https://mp.weixin.qq.co"):
            logger.info(f"正在提取微信文章内容: {link}")
            title_text, article_text, img_urls = extract_content_with_retry(link, max_retries=3)
            logger.info(f"提取完成,标题长度: {len(title_text)}, 内容长度: {len(article_text)}, 图片数: {len(img_urls)}")
        elif isinstance(link, str) and link.startswith("https://www.163.com"):
            logger.info(f"正在提取网易文章内容: {link}")
            title_text, article_text, img_urls = extract_content_with_retry(link, max_retries=3)
            logger.info(f"提取完成,标题长度: {len(title_text)}, 内容长度: {len(article_text)}, 图片数: {len(img_urls)}")
        else:
            logger.warning(f"不支持的链接格式: {link}")
            title_text, article_text, img_urls = "", "", []
        # Validate the extraction: skip articles with no title or an
        # implausibly long (>100 chars) title.
        if not title_text:
            logger.warning(f"文章标题为空,跳过处理: {link}")
            return
        elif len(title_text) > 100:
            logger.warning(f"文章标题过长,跳过处理: {link}")
            return
        # Skip articles shorter than the configured minimum length.
        min_article_length = int(CONFIG['General'].get('min_article_length', '100'))
        if len(article_text) < min_article_length:
            logger.warning(f"文章字数 {len(article_text)} 低于最小阈值 {min_article_length},跳过处理: {link}")
            return
        # Provisional title: first sentence of the body, minus a "正文:" prefix.
        title = extract_content_until_punctuation(article_text).replace("正文:", "")
        from datetime import datetime
        current_time = datetime.now().strftime("%H:%M:%S")
        print("当前时间:", current_time)
        if ai_service == "coze":
            logger.info("coze正在处理")
            logger.info(f"正在处理的文章类型为:{generation_type}")
            if current_template:
                # Save the global Coze config so the finally block below can
                # restore it after this article — CONFIG is shared across
                # worker threads, so the override is intentionally temporary.
                original_config = {
                    'workflow_id': CONFIG['Coze']['workflow_id'],
                    'access_token': CONFIG['Coze']['access_token'],
                    'is_async': CONFIG['Coze']['is_async']
                }
                CONFIG['Coze']['workflow_id'] = current_template.get('workflow_id', '')
                CONFIG['Coze']['access_token'] = current_template.get('access_token', '')
                CONFIG['Coze']['is_async'] = current_template.get('is_async', 'true')
                logger.info(f"应用模板配置: {current_template.get('name')}")
                logger.info(f"Workflow ID: {CONFIG['Coze']['workflow_id']}")
                # Token is masked in logs — only its length leaks.
                logger.info(f"Access Token: {'*' * len(CONFIG['Coze']['access_token'])}")
                logger.info(f"Is Async: {CONFIG['Coze']['is_async']}")
            try:
                # Parsed only as a config sanity check; the parsed value is not
                # used below (input_data is built inline per generation_type).
                input_data_template_str = CONFIG['Coze'].get('input_data_template')
                if not input_data_template_str:
                    raise ValueError("input_data_template 配置缺失")
                input_data_template = json.loads(input_data_template_str)
                if generation_type == "短篇":
                    # Short-piece workflow: body only; returns rewritten text.
                    input_data = {"article": article_text}
                    print("coze中输入", input_data)
                    message_content = call_coze_article_workflow(input_data)
                elif generation_type == "文章":
                    # Full-article workflow: title + body; returns
                    # (title, content) or an error dict.
                    print("原文中标题为:", title_text)
                    print("原文中内容为:", article_text)
                    input_data = {"title": title_text, "article": article_text}
                    print("发送的请求数据为:", input_data)
                    try:
                        result = call_coze_all_article_workflow(input_data)
                        if isinstance(result, dict) and 'error' in result:
                            raise Exception(result['error'])
                        title, message_content = result
                    except Exception as e:
                        logger.error(f"调用 Coze 工作流时出错: {e}")
                        raise
            finally:
                # Always restore the global Coze config overridden above.
                if original_config is not None:
                    for key, value in original_config.items():
                        CONFIG['Coze'][key] = str(value)
        # Coerce non-string workflow results to str so the slicing/len calls
        # and file write below cannot fail on type.
        if not isinstance(message_content, str):
            logger.warning(f"message_content 类型异常: {type(message_content)}")
            message_content = str(message_content) if message_content else ""
        logger.info(f"message_content处理前类型: {type(message_content)}, 长度: {len(message_content)}")
        if message_content:
            logger.info(f"message_content前300字符: {message_content[:300]}")
        # Trim whitespace around the original title.
        title_text = title_text.strip()
        # Derive the final title from the AI output.
        if generation_type == "短篇":
            logger.info(f"message_content原始内容长度: {len(message_content) if message_content else 0}")
            logger.info(f"message_content前200字符: {message_content[:200] if message_content else ''}")
            if isinstance(message_content, str) and "\n" in message_content:
                first_line, rest_content = message_content.split("\n", 1)
                logger.info(f"第一行内容: {first_line}")
                if len(first_line) < 100:
                    # First line looks like a title: use it (minus trailing
                    # CJK punctuation) and drop it from the body.
                    title = first_line.strip().rstrip(',。!?;:')
                    message_content = rest_content
                    logger.info(f"使用第一行作为标题: {title}")
                else:
                    title = extract_content_until_punctuation(message_content).replace("正文:", "")
                    logger.info(f"第一行过长使用extract提取: {title}")
            else:
                title = extract_content_until_punctuation(message_content).replace("正文:", "")
                logger.info(f"无换行使用extract提取: {title}")
            logger.info(f"最终title: '{title}'")
            logger.info(f"title长度: {len(title)}")
        elif generation_type == "文章":
            # Full-article workflow already returned its own title above.
            pass
        # Create the per-type output directory for articles.
        type_dir = os.path.join(ARTICLES_BASE_PATH, article_type)
        safe_open_directory(type_dir)
        # Build a de-duplicated file name from the title.
        file_name = ""
        title_to_use = title.strip() if title else ""
        logger.info(f"用于文件名的title: '{title_to_use}', 长度: {len(title_to_use)}")
        # NOTE(review): both branches are currently identical.
        if generation_type == '短篇':
            file_name = handle_duplicate_files_advanced(type_dir, title_to_use)[0]
        elif generation_type == "文章":
            file_name = handle_duplicate_files_advanced(type_dir, title_to_use)[0]
        logger.info(f"生成的file_name: '{file_name}'")
        article_save_path = os.path.join(type_dir, f"{file_name}.txt")
        # Strip Markdown code fences, then prepend the title as line 1.
        if "```" in message_content:
            message_content = message_content.replace("```", "")
        message_content = str(title) + "\n" + str(message_content)
        # Collect stats for the GUI list.
        original_word_count = len(article_text)
        image_count = len(img_urls)
        rewritten_word_count = len(message_content)
        # Optional plagiarism/originality check (config-gated; best-effort —
        # failures are logged and similarity stays None).
        enable_plagiarism_detection = CONFIG['General'].get('enable_plagiarism_detection', 'false').lower() == 'true'
        similarity_score = None
        if enable_plagiarism_detection:
            print("正在进行原创度检测")
            try:
                detector = FinalPlagiarismDetector()
                result = detector.analyze_similarity(article_text, message_content)
                similarity_score = result['overall_similarity']
                print(f"原创度检测完成,相似度: {similarity_score:.2%}")
            except Exception as e:
                logger.error(f"原创度检测出错: {e}")
                similarity_score = None
        else:
            print("原创度检测已禁用,跳过检测")
        # Optional compliance check (config-gated). When violations are not to
        # be saved, report to the GUI and bail out before writing the file.
        enable_detection = CONFIG['Baidu'].get('enable_detection', 'false').lower() == 'true'
        save_violation_articles = CONFIG['Baidu'].get('save_violation_articles', 'true').lower() == 'true'
        is_violation = False
        if enable_detection:
            print("正在检测文章合规度")
            detection_result = text_detection(message_content)
            if detection_result == "合规":
                print("文章合规")
                is_violation = False
            else:
                print(f"文章不合规: {detection_result}")
                is_violation = True
                if not save_violation_articles:
                    if app:
                        display_title = title
                        app.update_article_info_list(display_title, original_word_count, image_count, rewritten_word_count, True, similarity_score)
                    return
        else:
            print("违规检测已禁用,跳过检测")
            is_violation = False
        # Violating articles that are still saved get a "已违规" suffix.
        if is_violation and save_violation_articles:
            name_part, ext_part = os.path.splitext(article_save_path)
            article_save_path = f"{name_part}_已违规{ext_part}"
        with open(article_save_path, 'w', encoding='utf-8') as f:
            f.write(message_content)
        logging.info('文本已经保存')
        # Push the article's stats to the GUI list, if a GUI is attached.
        if app:
            display_title = title
            app.update_article_info_list(display_title, original_word_count, image_count, rewritten_word_count, is_violation, similarity_score)
        # Download and post-process the article's images into the per-type
        # image directory.
        if img_urls:
            type_picture_dir = os.path.join(IMGS_BASE_PATH, article_type)
            safe_open_directory(type_picture_dir)
            download_and_process_images(img_urls, file_name.strip(), type_picture_dir)
    except Exception as e:
        logging.error(f"处理链接 {link} 时出错: {e}")
        raise
def link_to_text(num_threads=None, ai_service="coze", current_template=None, generation_type=None, app=None):
    """Read article links from the Excel at TITLE_BASE_PATH, process them all
    with a worker-thread pool, and record successfully handled links.

    Args:
        num_threads: Optional worker count; capped by MAX_THREADS downstream.
        ai_service: AI backend identifier forwarded to process_link.
        current_template: Optional Coze template config forwarded downstream.
        generation_type: Default article type when a row has no type column.
        app: Optional GUI object forwarded downstream.

    Returns:
        The list of ((link, article_type), success, error) result tuples, or
        an empty list when there is nothing to process.
    """
    use_link_path = 'use_link_path.txt'
    # Read (link[, type]) rows from the spreadsheet.
    links = read_excel(TITLE_BASE_PATH)
    # Fix: log the total once, not once per link inside the loop.
    logging.info(f"总共{len(links)}个链接")
    filtered_links = []
    for link_info in links:
        link = link_info[0].strip()
        # Per-row type column overrides generation_type when non-empty.
        article_type = link_info[1].strip() if len(link_info) > 1 and link_info[1].strip() else generation_type
        filtered_links.append((link, article_type))
    if not filtered_links:
        logger.info("没有新链接需要处理")
        return []
    # Fan the links out over the worker-thread pool.
    results = process_links_with_threads(filtered_links, num_threads, ai_service, current_template, generation_type, app)
    # Append each successfully processed link to the record file.
    with open(use_link_path, 'a+', encoding='utf-8') as f:
        for result in results:
            # Fix: worker always emits 3-tuples; the old `len(result) >= 2`
            # guard still unpacked three values and would raise on a 2-tuple.
            if isinstance(result, tuple) and len(result) == 3:
                link_info, success, _ = result
                link = link_info[0] if isinstance(link_info, (tuple, list)) else link_info
                if success:
                    f.write(str(link) + "\n")
            else:
                logger.warning(f"意外的结果格式: {result}")
    return results
# Shared work/result queues used by worker() and process_links_with_threads().
# task_queue carries (link, article_type) tuples; a None item is the shutdown
# sentinel for one worker thread. result_queue collects
# ((link, article_type), success, error_or_None) tuples.
task_queue = queue.Queue()
result_queue = queue.Queue()
# Global pause switch: workers block on pause_event.wait() while it is
# cleared; starting it set() means "running" by default.
pause_event = threading.Event()
pause_event.set()
# 工作线程函数
def worker(ai_service, current_template=None, generation_type=None, app=None):
    """Thread target: consume (link, article_type) tasks from the global
    task_queue, run process_link on each, and push
    ((link, article_type), success, error_or_None) onto result_queue.

    Exits when it dequeues the None sentinel. Honors the global pause_event
    both before taking a task and after finishing one.
    """
    while True:
        try:
            # Block here while processing is paused (pause_event cleared).
            pause_event.wait()
            link_info = task_queue.get()
            # None is the shutdown sentinel pushed by process_links_with_threads.
            if link_info is None:
                break
            link, article_type = link_info
            try:
                logger.info(f"开始处理链接:{link}")
                process_link((link, article_type), ai_service, current_template, generation_type, app)
                result_queue.put(((link, article_type), True, None))
            except Exception as e:
                # Per-task failures are recorded, not fatal to the thread.
                result_queue.put(((link, article_type), False, str(e)))
                logger.error(f"处理链接 {link} 时出错: {e}")
            task_queue.task_done()
            # Re-check the pause switch before fetching the next task.
            pause_event.wait()
        except Exception as e:
            # Last-resort guard: keep the worker alive on unexpected errors.
            # NOTE(review): this path skips task_done() for the failed item —
            # harmless here since nothing calls task_queue.join().
            logger.error(f"工作线程出错: {e}")
# 多线程处理链接
def process_links_with_threads(links, num_threads=None, ai_service="coze", current_template=None, generation_type=None, app=None):
    """Process the given (link, article_type) pairs on a pool of worker threads.

    Spawns daemon workers running the module-level worker(), feeds them the
    links plus one None sentinel each, waits for every thread to finish, and
    drains result_queue.

    Args:
        links: Sequence of (link, article_type) tuples to process.
        num_threads: Optional requested worker count; always capped by
            MAX_THREADS and by the number of links.
        ai_service / current_template / generation_type / app: Forwarded
            unchanged to each worker.

    Returns:
        List of ((link, article_type), success, error_or_None) tuples.
    """
    # Cap the pool size: MAX_THREADS is a hard limit, and there is no point
    # starting more workers than there are links.
    cap = MAX_THREADS if num_threads is None else min(num_threads, MAX_THREADS)
    worker_count = min(cap, len(links))
    # Discard any stale items left over from a previous run.
    for leftover_queue in (task_queue, result_queue):
        while not leftover_queue.empty():
            leftover_queue.get()
    # Start the daemon workers.
    threads = []
    for _ in range(worker_count):
        thread = threading.Thread(target=worker, args=(ai_service, current_template, generation_type, app))
        thread.daemon = True
        thread.start()
        threads.append(thread)
    # Enqueue all work, then one None shutdown sentinel per worker.
    for item in links:
        task_queue.put(item)
    for _ in range(worker_count):
        task_queue.put(None)
    # Wait for every worker to drain its share and exit.
    for thread in threads:
        thread.join()
    # Collect everything the workers reported.
    collected = []
    while not result_queue.empty():
        collected.append(result_queue.get())
    return collected