Modify web content fetching code
commit 666494c0c2
parent 113c97c887
@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
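Note: with a readonly Combobox, the StringVar default is the only thing that selects the initial entry, so this one-line change flips the app's startup workflow to coze. A minimal sketch of that behavior (widget names taken from the hunk above):

import tkinter as tk
from tkinter import ttk

root = tk.Tk()
ai_service_var = tk.StringVar(value="coze")  # "coze" is preselected at startup
ai_service_combo = ttk.Combobox(root, textvariable=ai_service_var,
                                values=["dify", "coze"], state="readonly")
ai_service_combo.pack(padx=5, pady=5)
print(ai_service_var.get())  # -> "coze", before the user touches the widget
root.destroy()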
@@ -87,7 +87,10 @@ def download_and_process_images(img_urls, article_title):
     """
     Download and process images
     """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    logger.info(IMGS_BASE_PATH)
+    img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
+    # img_dir_path = IMGS_BASE_PATH + "/" + article_title
     logger.info(img_dir_path)
     safe_open_directory(img_dir_path)

     for i, img_url in enumerate(img_urls):
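The str() casts matter when IMGS_BASE_PATH comes from config as a pathlib.Path or when article_title arrives as a non-string; os.path.join raises TypeError on components that are neither str nor PathLike. A small sketch of the failure the cast avoids (the int title is an assumption for illustration):

import os
from pathlib import Path

IMGS_BASE_PATH = Path("imgs")   # assumption: the base path may be a Path object
article_title = 20250101        # assumption: a title that arrived as an int

# os.path.join(IMGS_BASE_PATH, article_title) would raise
# TypeError: ... should be str ... not 'int'; the casts sidestep it:
img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
print(img_dir_path)  # imgs/20250101 (imgs\20250101 on Windows)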
@@ -26,6 +26,8 @@ def process_link(link, ai_service):
     else:
         title_text, article_text, img_urls = "", "", []

+    print(title_text)
+
     if title_text == "":
         return
     elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):
     # finally_article = message_content.replace("正文:", "") + "\n"

-    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

     article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
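The new [0] implies handle_duplicate_files_advanced returns a sequence whose first element is the collision-free name. The real helper lives in utils and is not shown in this diff; a hypothetical stand-in with that shape:

import os

def handle_duplicate_files_advanced(base_path, title):
    # Hypothetical sketch: return (unique_name, attempts) so callers index [0]
    candidate, attempts = title, 0
    while os.path.exists(os.path.join(base_path, f"{candidate}.txt")):
        attempts += 1
        candidate = f"{title}({attempts})"
    return candidate, attempts

file_name = handle_duplicate_files_advanced("articles", "my-title")[0]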
@@ -2,7 +2,7 @@ import threading
 import queue
 import json  # import the json module

-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
 from databases import *

 from images_edit import download_and_process_images
@@ -25,6 +25,8 @@ def process_link(link, ai_service):
         title_text, article_text, img_urls = toutiao_extract_content(link)
     elif link.startswith("https://mp.weixin.qq.co"):
         title_text, article_text, img_urls = wechat_extract_content(link)
+    elif link.startswith("https://www.163.com"):
+        title_text, article_text, img_urls = wangyi_extract_content(link)
     else:
         title_text, article_text, img_urls = "", "", []
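Each new source site adds another elif to this chain. Not part of the commit, but a prefix-to-extractor table is one way to keep the dispatch flat as more extractors arrive; a sketch assuming the same get_web_content helpers:

from get_web_content import (toutiao_extract_content, wechat_extract_content,
                             wangyi_extract_content)

EXTRACTORS = {
    "https://www.toutiao.com": toutiao_extract_content,
    "https://mp.weixin.qq.co": wechat_extract_content,  # prefix kept exactly as in the diff
    "https://www.163.com": wangyi_extract_content,
}

def extract(link):
    for prefix, extractor in EXTRACTORS.items():
        if link.startswith(prefix):
            return extractor(link)
    return "", "", []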
@@ -44,7 +46,7 @@ def process_link(link, ai_service):
     title = extract_content_until_punctuation(article_text).replace("正文:", "")

     print(title)
     print(img_urls)
     print(article_text)

     from datetime import datetime
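The title is carved out of article_text up to the first punctuation mark, with the leading 正文: ("body text:") label stripped. extract_content_until_punctuation is not shown in this commit; a plausible sketch of its contract:

import re

def extract_content_until_punctuation(text):
    # Hypothetical sketch: keep everything before the first sentence punctuation
    match = re.search(r"[。!?!?,,]", text)
    return text[:match.start()] if match else text

title = extract_content_until_punctuation("正文:今天发生了一件事。后续内容").replace("正文:", "")
print(title)  # 今天发生了一件事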
@@ -78,55 +80,56 @@ def process_link(link, ai_service):
         # }
         message_content = call_dify_workflow(input_data)
     elif ai_service == "coze":
-        logger.info("coze正在处理")
-        weijin = ""
-        if check_keywords:
-            weijin = "违禁"
-        # Load the Coze input_data template from config
-        input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                                                     '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-        try:
-            # Parse the template string into a dict
-            input_data_template = json.loads(input_data_template_str)
-            # Format the template with the actual variables
-            input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                          input_data_template.items()}
-        except (json.JSONDecodeError, KeyError, AttributeError) as e:
-            logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
-            input_data = {
-                "article": article_text,
-                "link": link,
-                "weijin": weijin
-            }
+        # logger.info("coze正在处理")
+        # weijin = ""
+        # if check_keywords:
+        #     weijin = "违禁"
+        # # Load the Coze input_data template from config
+        # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+        #                                              '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+        # try:
+        #     # Parse the template string into a dict
+        #     input_data_template = json.loads(input_data_template_str)
+        #     # Format the template with the actual variables
+        #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+        #                   input_data_template.items()}
+        # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+        #     logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
+        #     input_data = {
+        #         "article": article_text,
+        #         "link": link,
+        #         "weijin": weijin
+        #     }
+        input_data = {
+            "article": article_text
+        }
+        message_content = call_coze_article_workflow(input_data)

-        msg = call_coze_workflow(input_data)
-        message_content = msg['article']
-        result = msg['result']
-        if result == "已经创作过":
-            return
     # Get the current time and format it
     current_time = datetime.now().strftime("%H:%M:%S")

     # Print the current time
     print("当前时间:", current_time)

-    finally_article = message_content.replace("正文:", "") + "\n"
-
-    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

-    if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
+
+    if '*' in message_content or '#' in message_content or "-" in message_content:
         # Replace several characters in one pass with a regex
         old_content = re.sub(r'[*#-]', '', message_content)
     else:
         # No replacement needed, keep the content as-is
-        old_content = finally_article
+        old_content = message_content

     print("改写完成的文章:" + old_content)

     # Strip AI-sounding vocabulary
     content = old_content

     check_link_insert(host, user, password, database, link)


     # Check the article's compliance
     if text_detection(content) == "合规":
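The switch from call_coze_workflow to call_coze_article_workflow also changes the return contract: the old call was unpacked as a dict with 'article' and 'result' keys (returning early on 已经创作过, "already rewritten"), while message_content now feeds .replace() and re.sub() directly, so the new call presumably returns the article as a plain string. Assumed shapes, inferred from usage only; the real signatures live in ai_studio:

def call_coze_workflow(input_data):
    # old contract (inferred): rewritten text plus a dedup flag
    return {"article": "...rewritten text...", "result": "已经创作过"}

def call_coze_article_workflow(input_data):
    # new contract (inferred): the rewritten article as a plain string
    return "...rewritten text..."

message_content = call_coze_article_workflow({"article": "original text"})
assert isinstance(message_content, str)  # downstream .replace()/re.sub need a str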
@@ -141,7 +144,7 @@ def process_link(link, ai_service):
             logging.info('文本已经保存')

             if img_urls:
-                download_and_process_images(img_urls, title)
+                download_and_process_images(img_urls, file_name)

     except Exception as e:
         logging.error(f"处理链接 {link} 时出错: {e}")
@@ -163,13 +166,13 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
     for link in links:
         logging.info(f"总共{len(links)}个链接")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"链接已存在: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"链接不存在: {link}")
-            print("链接不存在,存储到过滤器中:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"链接已存在: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link)
+        # logger.info(f"链接不存在: {link}")
+        # print("链接不存在,存储到过滤器中:", link)

     if not filtered_links:
         logger.info("没有新链接需要处理")
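With check_link_exists commented out, every incoming link is appended, so links already stored in the database will be fetched and rewritten again on the next run. If the database round-trip was the bottleneck, a batch-local set at least drops duplicates within one invocation (a sketch, not part of the commit):

links = ["https://www.163.com/a", "https://www.163.com/a", "https://www.sohu.com/b"]

seen = set()
filtered_links = []
for link in links:
    if link in seen:  # batch-local dedup only; no database involved
        continue
    seen.add(link)
    filtered_links.append(link)

print(filtered_links)  # ['https://www.163.com/a', 'https://www.sohu.com/b']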
@@ -4,17 +4,24 @@ import requests

 from bs4 import BeautifulSoup

-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
+from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content

 from utils import handle_duplicate_files_advanced

 from images_edit import download_and_process_images

 # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
+# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
 # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
 # print(imgs)
 # print(type(imgs))
+# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
+title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")

 print(title)
 print(article)
 print(imgs)
 print(type(imgs))
 #
 # download_and_process_images(imgs,"1")

-name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-print(name[0])
+#
+# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
+# print(name[0])
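The test script treats every *_extract_content helper as returning a (title, article, imgs) triple, and the new souhu_extract_content is used the same way. The extractor itself is not in this diff; a generic sketch of that shape using the requests and BeautifulSoup imports already at the top of the file (the selectors are placeholders, not Sohu's real markup):

import requests
from bs4 import BeautifulSoup

def example_extract_content(link):
    resp = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""
    article = "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))
    imgs = [img["src"] for img in soup.find_all("img") if img.get("src")]
    return title, article, imgs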
@@ -145,9 +145,15 @@ def insert_images_into_paragraphs(paragraphs, image_folder, doc, title):
     :return:
     """
+    if os.path.exists(image_folder):
+        images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+                         img.lower().endswith(('jpg'))])
+    else:
+        images = []

     # Get the image list and sort it
-    images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
-                     img.lower().endswith(('jpg'))])
+    # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+    #                  img.lower().endswith(('jpg'))])
     # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
     # #                  img.lower().endswith(('png', 'jpg', 'jpeg'))])
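The new guard avoids the FileNotFoundError that os.listdir raises when the image folder was never created. One quirk carried over from the old line: endswith(('jpg')) passes a plain string, because ('jpg') is not a tuple; matching several extensions needs a real tuple. A guarded variant with a multi-extension filter (the extension list is an assumption):

import os

def list_images(image_folder):
    # ('jpg') == 'jpg'; a one-element tuple needs a comma: ('jpg',)
    if not os.path.exists(image_folder):
        return []
    return sorted(
        os.path.join(image_folder, img)
        for img in os.listdir(image_folder)
        if img.lower().endswith(("png", "jpg", "jpeg"))
    )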
@@ -271,7 +277,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
     else:
         new_text = text.replace("```markdown", "").replace("```", "")
     content = new_text
-    image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+    from pathlib import Path
+
+    img_path = Path(img_path)
+    image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")

     # crop_and_replace_images(image_folder)

     create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
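pathlib's / operator replaces the hand-rolled backslash concatenation and picks the right separator on any OS; a quick equivalence check (paths are illustrative):

from pathlib import Path

img_path = r"F:\work\imgs"
txt_name = "example.txt"

old_folder = img_path + "\\" + txt_name.replace(".txt", "").rstrip(".")  # Windows-only
new_folder = Path(img_path) / txt_name.replace(".txt", "").rstrip(".")   # portable

print(old_folder)       # F:\work\imgs\example
print(str(new_folder))  # same string on Windows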