修改获取网页内容代码

2025-05-30 17:59:58 +08:00 · 2025-05-30 17:59:58 +08:00 · 42fc2e661f
commit 42fc2e661f
parent 0792027bea
8 changed files with 205 additions and 53 deletions
--- a/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
+++ b/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
@ -29,7 +29,7 @@ class ArticleReplaceApp(tk.Tk):
    def __init__(self):
        super().__init__()

-        self.title("文章采集与处理工具")
+        self.title("文章工作流调用工具（软件仅供交流使用）")
        self.geometry("900x600")

        # 创建标签页控件
@ -44,6 +44,11 @@ class ArticleReplaceApp(tk.Tk):
        self.config_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.config_frame, text="配置")

+        # 创建免责声明页面
+        self.disclaimer_frame = ttk.Frame(self.notebook)
+        self.notebook.add(self.disclaimer_frame, text="免责声明")
+
+        
        # 初始化变量
        self.running = False
        self.thread = None
@ -67,6 +72,8 @@ class ArticleReplaceApp(tk.Tk):
        self.init_main_frame()
        # 初始化配置页面
        self.init_config_frame()
+        # 初始化免责声明页面
+        self.init_disclaimer_frame()

        # 设置关闭窗口事件
        self.protocol("WM_DELETE_WINDOW", self.on_close)
@ -407,6 +414,54 @@ class ArticleReplaceApp(tk.Tk):
        parent.columnconfigure(0, weight=1)
        parent.rowconfigure(1, weight=1)
        
+    def init_disclaimer_frame(self):
+        """
+        Build the contents of the disclaimer tab (self.disclaimer_frame).
+
+        The tab holds a title label, a scrolled text widget filled with the
+        disclaimer and then disabled so it cannot be edited, and a button
+        that selects the notebook's first tab.
+        """
+        # Container frame for all disclaimer widgets
+        disclaimer_content = ttk.Frame(self.disclaimer_frame)
+        disclaimer_content.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)
+        
+        # Title
+        title_label = ttk.Label(disclaimer_content, text="免责声明", font=("Arial", 16, "bold"))
+        title_label.pack(pady=10)
+        
+        # Disclaimer body text.
+        # The literal below starts with a newline and ends with the closing
+        # line's indentation, so both are part of the inserted text.
+        disclaimer_text = ScrolledText(disclaimer_content, width=80, height=20, wrap=tk.WORD)
+        disclaimer_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
+        disclaimer_text.insert(tk.END, """
+软件使用免责声明
+
+1. 合法使用声明
+   本软件仅供合法、正当用途使用。用户应当遵守中华人民共和国相关法律法规，不得将本软件用于任何违法犯罪活动。
+
+2. 内容责任声明
+   用户通过本软件生成、处理或发布的所有内容，其版权归属、合法性及内容真实性由用户自行负责。本软件开发者不对用户使用本软件处理的内容承担任何法律责任。
+
+3. 使用风险声明
+   用户应自行承担使用本软件的风险。本软件按"现状"提供，不提供任何明示或暗示的保证，包括但不限于适销性、特定用途适用性和非侵权性的保证。
+
+4. 禁止用途
+   严禁将本软件用于以下活动：
+   - 违反国家法律法规的活动
+   - 侵犯他人知识产权或其他合法权益的活动
+   - 传播虚假、欺诈或误导性信息的活动
+   - 从事任何可能危害国家安全、社会稳定的活动
+   - 其他违背社会公德、商业道德的活动
+
+5. 责任限制
+   在法律允许的最大范围内，对于因使用或无法使用本软件而导致的任何直接、间接、偶然、特殊、惩罚性或后果性损害，本软件开发者不承担任何责任。
+
+6. 协议更新
+   本免责声明可能会不定期更新，更新后的内容将在软件中公布，不再另行通知。用户继续使用本软件即表示接受修改后的免责声明。
+
+7. 最终解释
+   本免责声明的最终解释权归本软件开发者所有。
+        """)
+        disclaimer_text.config(state=tk.DISABLED)  # make the text read-only
+        
+        # Confirm button: it only switches to the first tab (index 0);
+        # no acceptance state is stored by this method.
+        confirm_frame = ttk.Frame(disclaimer_content)
+        confirm_frame.pack(pady=10)
+        ttk.Button(confirm_frame, text="我已阅读并同意以上声明", command=lambda: self.notebook.select(0)).pack()
+
    def save_banned_words(self):
        # 处理文本，将换行符替换为逗号
        words = self.banned_words_text.get(1.0, tk.END).strip().replace('\n', ',')
@ -694,7 +749,7 @@ class ArticleReplaceApp(tk.Tk):
        # 创建模板选择对话框
        dialog = tk.Toplevel(self)
        dialog.title("选择模板")
-        dialog.geometry("400x300")
+        dialog.geometry("400x400")
        dialog.transient(self)  # 设置为应用程序的子窗口
        dialog.grab_set()  # 模态对话框
        dialog.resizable(False, False)
@ -1152,7 +1207,7 @@ class ArticleReplaceApp(tk.Tk):
            logger.info(f"开始处理链接，使用 {num_threads} 个线程，生成类型: {generation_type}")
            if current_template:
                logger.info(f"使用模板: {current_template.get('name', '未命名')}")
-            results = link_to_text(num_threads=num_threads, ai_service=ai_service, current_template=current_template)
+            results = link_to_text(num_threads=num_threads, ai_service=ai_service, current_template=current_template, generation_type=generation_type)

            # 计算处理结果
            total_links = len(results)
@ -1172,7 +1227,8 @@ class ArticleReplaceApp(tk.Tk):
                                                      f"共处理 {total_links} 个链接\n成功: {success_links} 个\n失败: {total_links - success_links} 个\n总耗时: {elapsed_time:.2f} 秒"))
        except Exception as e:
            logger.error(f"处理任务出错: {e}")
-            self.after(0, lambda e=e: messagebox.showerror("处理错误", f"处理任务出错: {e}"))
+            self.after(0, lambda: messagebox.showerror("处理错误", f"处理任务出错: {e}"))
+            # self.after(0, lambda e=e: messagebox.showerror("处理错误", f"处理任务出错: {e}"))
        finally:
            # 恢复原始配置（如果有的话）
            if original_config is not None:
--- a/ArticleReplaceBatch/ai_studio.py
+++ b/ArticleReplaceBatch/ai_studio.py
@ -124,3 +124,51 @@ def call_coze_article_workflow(parameters):
            "error": f"请求失败，状态码：{response.status_code}",
            "detail": response.text
        }
+
+
+def _parse_coze_payload(text):
+    """
+    Decode a payload string returned by the Coze API into a Python object.
+
+    JSON is tried first: ast.literal_eval rejects the JSON literals
+    true/false/null, so it cannot parse general JSON. Text that is not valid
+    JSON falls back to ast.literal_eval, which is what this code used before,
+    so payloads that were accepted previously are still accepted.
+
+    :param text: payload text to decode
+    :return: the decoded object
+    :raises ValueError: if the text is neither JSON nor a Python literal
+    """
+    import ast
+    import json
+
+    try:
+        return json.loads(text)
+    except ValueError:
+        # json.JSONDecodeError is a ValueError subclass.
+        return ast.literal_eval(text)
+
+
+def call_coze_chang_article_workflow(parameters):
+    """
+    Run the Coze workflow configured in CONFIG['Coze'] and return its article.
+
+    The workflow id, access token and async flag are read from CONFIG['Coze'];
+    they are not parameters of this function.
+
+    :param parameters: input parameters passed to the workflow (dict)
+    :return: tuple ``(title, article)`` read from the response's ``data`` payload
+    :raises RuntimeError: if the HTTP status is not 200 or the response has
+        no ``data`` field. Earlier versions returned an error dict here, which
+        a ``title, article = ...`` caller silently unpacked into the dict's
+        two keys.
+    :raises KeyError: if the decoded ``data`` lacks ``title`` or ``article``
+    """
+
+    workflow_id = CONFIG['Coze']['workflow_id']
+    access_token = CONFIG['Coze']['access_token']
+    is_async = CONFIG['Coze']['is_async'].lower() == 'true'
+    url = "https://api.coze.cn/v1/workflow/run"
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "workflow_id": workflow_id,
+        "parameters": parameters,
+        "is_async": is_async
+    }
+
+    # NOTE(review): no timeout is passed, so a stalled connection blocks the
+    # calling worker thread; pick a limit that fits the workflow's run time.
+    response = requests.post(url, json=data, headers=headers)
+
+    if response.status_code != 200:
+        raise RuntimeError(f"请求失败，状态码：{response.status_code}，详情：{response.text}")
+
+    # Decode the whole response body
+    result_dict = _parse_coze_payload(response.text)
+
+    # The data field is itself an encoded string that must be decoded again
+    payload = result_dict.get('data') if isinstance(result_dict, dict) else None
+    if not payload:
+        raise RuntimeError(f"Coze 工作流未返回 data 字段：{response.text}")
+    data_dict = _parse_coze_payload(payload) if isinstance(payload, str) else payload
+
+    return data_dict['title'], data_dict['article']
--- a/ArticleReplaceBatch/articles/情感/23岁河南“懒人”懒死家中，让人警醒：家庭最大悲哀是无底线纵容.txt
+++ b/ArticleReplaceBatch/articles/情感/23岁河南“懒人”懒死家中，让人警醒：家庭最大悲哀是无底线纵容.txt
@ -0,0 +1,12 @@
+
+
+你绝对想不到！江苏王女士最近收到电费单时惊了，夏天每月电费突然涨到800元。她翻出家里所有电器，连路由器都拔了，结果第二个月电费反而涨到900块！
+
+据《现代快报》报道，供电局工作人员上门检查后才发现，罪魁祸首是待机状态的空调。王女士家3台空调插头都没拔，每月能白白耗掉200多度电。这事让不少网友直呼"活久见"，有人留言："我家电视常年插着电源，难怪电费总降不下来！"
+
+其实国家电网早做过测试，普通家电待机功率在1-3瓦之间。按每天待机20小时算，光机顶盒一年就能吃掉30度电。更扎心的是，很多家庭至少有5台电器长期插着电，一年下来相当于白交三百块！
+
+我特意翻出家里老电表，发现拔掉所有插头后，电表真的转得慢了。现在我家冰箱外的电器用完就拔，这个月省了五十多电费。你家电表跑得快吗？赶紧试试拔插头吧！
+
+生活窍门 家庭用电 省电妙招 居家过日子 
+你家最近电费有变化吗？评论区聊聊你的省电妙招吧！
--- a/ArticleReplaceBatch/articles/情感/姑父56万寻人后续：两个侄子已找到痛哭后悔，内情曝光，网友炸锅.txt
+++ b/ArticleReplaceBatch/articles/情感/姑父56万寻人后续：两个侄子已找到痛哭后悔，内情曝光，网友炸锅.txt
@ -0,0 +1,11 @@
+
+
+上海垃圾分类新规实施半个月，罚款总额突破200万！据东方网报道，光是黄浦区就开出了2.3万张罚单，平均每分钟都有居民被处罚。我家楼下王阿姨前天刚被罚了50块，就因为在垃圾站门口多站了半分钟。
+
+可你绝对想不到，全市60%的罚款都集中在3个高档小区。这些小区明明配置了智能分类设备，还有专人指导，结果反而成了"重灾区"。隔壁张叔气得直拍大腿："我天天在家分拣半小时，最后还因为垃圾袋颜色不对被罚！"
+
+据环保局数据显示，新规实施后厨余垃圾分拣正确率反而下降了5%。这事真不能全怪老百姓，有些小区督导员自己都搞不清分类标准。我亲眼见过督导员把干电池扔进有害垃圾箱，那可是要扣分的啊！
+
+不过话说回来，垃圾分类确实是利国利民的好事。关键是不能"一刀切"，得给大伙儿适应时间。听说杭州试点"三次提醒再罚款"的模式，效果反而更好。这事您怎么看？您家小区垃圾分类顺利吗？
+
+垃圾分类新规 罚款争议 上海热点 社区管理 民生政策
--- a/ArticleReplaceBatch/main_process_wtt.py
+++ b/ArticleReplaceBatch/main_process_wtt.py
@ -2,7 +2,7 @@ import threading
 import queue
 import json  # 导入 json 模块

-from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow,call_coze_chang_article_workflow
 from databases import *

 from images_edit import download_and_process_images
@ -12,7 +12,7 @@ from config import *


 # ==============================主程序===========================
-def process_link(link_info, ai_service, current_template=None):
+def process_link(link_info, ai_service, current_template=None,generation_type=None):
    link, article_type = link_info  # 解包链接和类型信息
    """
    处理单个链接
@ -48,8 +48,6 @@ def process_link(link_info, ai_service, current_template=None):

        title = extract_content_until_punctuation(article_text).replace("正文：", "")

-        print(img_urls)
-        print(article_text)

        from datetime import datetime

@ -83,7 +81,7 @@ def process_link(link_info, ai_service, current_template=None):
            message_content = call_dify_workflow(input_data)
        elif ai_service == "coze":
            logger.info("coze正在处理")
-            
+            logger.info(f"正在处理的文章类型为：{generation_type}")
            # 如果有模板配置，临时更新CONFIG
            original_config = None
            if current_template:
@ -120,7 +118,13 @@ def process_link(link_info, ai_service, current_template=None):
                }
                
            try:
-                message_content = call_coze_article_workflow(input_data)
+                title = ""
+                if generation_type == "短篇":
+                    message_content = call_coze_article_workflow(input_data)
+                elif generation_type == "文章":
+                    title, message_content = call_coze_chang_article_workflow(input_data)
+
+
            finally:
                # 恢复原始配置（如果有的话）
                if original_config is not None:
@ -134,9 +138,11 @@ def process_link(link_info, ai_service, current_template=None):

        # 打印当前时间
        print("当前时间:", current_time)
-
-
-        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
+        file_name = ""
+        if generation_type == '短篇':
+            file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
+        elif generation_type == "文章":
+            file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title)[0]


        # 创建类型目录
@ -146,22 +152,11 @@ def process_link(link_info, ai_service, current_template=None):
        # 在类型目录下保存文章
        article_save_path = os.path.join(type_dir, f"{file_name}.txt")

-        if '*' in message_content or '#' in message_content or "-" in message_content:
-            # 使用正则表达式一次性替换多个字符
-            old_content = re.sub(r'[*#-]', '', message_content)
-        else:
-            # 如果不需要替换，直接使用原内容
-            old_content = message_content
-
-        print("改写完成的文章：" + old_content)
-
-        # 删除AI词汇
-        content = old_content



        # 判断文章合规度
-        if text_detection(content) == "合规":
+        if text_detection(message_content) == "合规":
            print("文章合规")
            pass
        else:
@ -169,7 +164,7 @@ def process_link(link_info, ai_service, current_template=None):
            return

        with open(article_save_path, 'w', encoding='utf-8') as f:
-            f.write(content)
+            f.write(message_content)
        logging.info('文本已经保存')

        if img_urls:
@ -183,7 +178,7 @@ def process_link(link_info, ai_service, current_template=None):
        raise


-def link_to_text(num_threads=None, ai_service="dify", current_template=None):
+def link_to_text(num_threads=None, ai_service="dify", current_template=None, generation_type=None):
    use_link_path = 'use_link_path.txt'

    # 读取链接
@ -198,7 +193,8 @@ def link_to_text(num_threads=None, ai_service="dify", current_template=None):

    for link_info in links:
        link = link_info[0].strip()  # 获取链接并去除空白字符
-        article_type = link_info[1].strip()  # 获取类型并去除空白字符
+        # 如果Excel中有类型，使用Excel中的类型，否则使用传入的generation_type
+        article_type = link_info[1].strip() if len(link_info) > 1 and link_info[1].strip() else generation_type
        logging.info(f"总共{len(links)}个链接")
        # if check_link_exists(host, user, password, database, link):
        # logger.info(f"链接已存在: {link}")
@ -213,7 +209,7 @@ def link_to_text(num_threads=None, ai_service="dify", current_template=None):
        return []

    # 使用多线程处理链接
-    results = process_links_with_threads(filtered_links, num_threads, ai_service, current_template)
+    results = process_links_with_threads(filtered_links, num_threads, ai_service, current_template,generation_type)

    # 记录已处理的链接
    with open(use_link_path, 'a+', encoding='utf-8') as f:
@ -230,7 +226,7 @@ result_queue = queue.Queue()


 # 工作线程函数
-def worker(ai_service, current_template=None):
+def worker(ai_service, current_template=None,generation_type=None):
    while True:
        try:
            # 从队列中获取任务
@ -241,7 +237,7 @@ def worker(ai_service, current_template=None):
            # 处理链接
            try:
                logger.info(f"开始处理链接：{link}")
-                process_link(link, ai_service, current_template)
+                process_link(link, ai_service, current_template,generation_type)
                result_queue.put((link, True, None))  # 成功
            except Exception as e:
                result_queue.put((link, False, str(e)))  # 失败
@ -254,7 +250,7 @@ def worker(ai_service, current_template=None):


 # 多线程处理链接
-def process_links_with_threads(links, num_threads=None, ai_service="dify", current_template=None):
+def process_links_with_threads(links, num_threads=None, ai_service="dify", current_template=None,generation_type=None):
    if num_threads is None:
        num_threads = min(MAX_THREADS, len(links))
    else:
@ -271,7 +267,7 @@ def process_links_with_threads(links, num_threads=None, ai_service="dify", curre

    # 将AI服务选择和模板配置传递给worker函数
    for _ in range(num_threads):
-        t = threading.Thread(target=worker, args=(ai_service, current_template))
+        t = threading.Thread(target=worker, args=(ai_service, current_template,generation_type))
        t.daemon = True
        t.start()
        threads.append(t)
--- a/ArticleReplaceBatch/picture/情感/23岁河南“懒人”懒死家中，让人警醒：家庭最大悲哀是无底线纵容/图片0.jpg
+++ b/ArticleReplaceBatch/picture/情感/23岁河南“懒人”懒死家中，让人警醒：家庭最大悲哀是无底线纵容/图片0.jpg
--- a/ArticleReplaceBatch/picture/情感/姑父56万寻人后续：两个侄子已找到痛哭后悔，内情曝光，网友炸锅/图片0.jpg
+++ b/ArticleReplaceBatch/picture/情感/姑父56万寻人后续：两个侄子已找到痛哭后悔，内情曝光，网友炸锅/图片0.jpg
--- a/ArticleReplaceBatch/test.py
+++ b/ArticleReplaceBatch/test.py
@ -1,27 +1,56 @@
-import json
-
 import requests

-from bs4 import BeautifulSoup

-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content
+def call_coze_article_workflow(workflow_id,access_token,parameters,is_async=False):
+    """
+    Call a Coze workflow through the workflow/run endpoint.

-from utils import handle_duplicate_files_advanced
-from images_edit import download_and_process_images
+    :param workflow_id: id of the workflow to run
+    :param access_token: token sent as the Bearer value of the Authorization header
+    :param parameters: input parameters passed to the workflow (dict)
+    :param is_async: whether to run asynchronously (default False)
+    :return: tuple ``(title, article)`` when the HTTP status is 200; otherwise
+        a dict with the keys ``error`` and ``detail``
+    """

-# title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
-# title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
-# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
-title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")

-print(title)
-print(article)
-print(imgs)
-print(type(imgs))
-#
-# download_and_process_images(imgs,"1")
+    url = "https://api.coze.cn/v1/workflow/run"
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "workflow_id": workflow_id,
+        "parameters": parameters,
+        "is_async": is_async
+    }

-#
-# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-# print(name[0])
+    response = requests.post(url, json=data, headers=headers)
+
+    if response.status_code == 200:
+        # data = json.loads(response.text)['data']
+        # print("data：",data['output'])
+        import ast
+
+        # Parse the whole response body.
+        # NOTE(review): ast.literal_eval only accepts Python literals, so a
+        # body containing JSON true/false/null raises ValueError here.
+        result_dict = ast.literal_eval(response.text)
+
+        # The data field is itself a string; parse it a second time
+        data_dict = ast.literal_eval(result_dict['data'])
+
+        # Read the title and article values
+        title = data_dict['title']
+        article = data_dict['article']
+        return title, article
+    else:
+        # NOTE(review): this branch returns a dict while the success branch
+        # returns a tuple; a caller that unpacks two values gets the dict keys.
+        return {
+            "error": f"请求失败，状态码：{response.status_code}",
+            "detail": response.text
+        }
+
+
+import os
+
+workflow_id = "7509764025128845366"
+# The access token is read from the environment so that no credential is
+# stored in the source file. Any token that was ever committed to the
+# repository should be treated as leaked and revoked.
+access_token = os.environ.get("COZE_ACCESS_TOKEN")
+if not access_token:
+    raise SystemExit("请先设置环境变量 COZE_ACCESS_TOKEN")
+parameters = {
+    "title":"1",
+    "article":"1"
+}
+result = call_coze_article_workflow(workflow_id,access_token,parameters)
+if isinstance(result, dict):
+    # Error dict from a non-200 response: print it as-is; unpacking it into
+    # two names would only yield its keys.
+    print(result)
+else:
+    title,article = result
+    print(title,article)