修改获取网页内容代码
This commit is contained in:
parent
113c97c887
commit
666494c0c2
@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
|
|||||||
|
|
||||||
# AI服务提供商选择
|
# AI服务提供商选择
|
||||||
ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
|
ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
|
||||||
self.ai_service_var = tk.StringVar(value="dify")
|
self.ai_service_var = tk.StringVar(value="coze")
|
||||||
ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
|
ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
|
||||||
ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
|
ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
|
||||||
|
|
||||||
@ -87,7 +87,10 @@ def download_and_process_images(img_urls, article_title):
|
|||||||
"""
|
"""
|
||||||
下载并处理图片
|
下载并处理图片
|
||||||
"""
|
"""
|
||||||
img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
|
logger.info(IMGS_BASE_PATH)
|
||||||
|
img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
|
||||||
|
# img_dir_path = IMGS_BASE_PATH + "/" + article_title
|
||||||
|
logger.info(img_dir_path)
|
||||||
safe_open_directory(img_dir_path)
|
safe_open_directory(img_dir_path)
|
||||||
|
|
||||||
for i, img_url in enumerate(img_urls):
|
for i, img_url in enumerate(img_urls):
|
||||||
|
|||||||
@ -26,6 +26,8 @@ def process_link(link, ai_service):
|
|||||||
else:
|
else:
|
||||||
title_text, article_text, img_urls = "", "", []
|
title_text, article_text, img_urls = "", "", []
|
||||||
|
|
||||||
|
print(title_text)
|
||||||
|
|
||||||
if title_text == "":
|
if title_text == "":
|
||||||
return
|
return
|
||||||
elif len(title_text) > 100:
|
elif len(title_text) > 100:
|
||||||
@ -111,7 +113,7 @@ def process_link(link, ai_service):
|
|||||||
|
|
||||||
# finally_article = message_content.replace("正文:", "") + "\n"
|
# finally_article = message_content.replace("正文:", "") + "\n"
|
||||||
|
|
||||||
file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
|
file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
|
||||||
|
|
||||||
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
|
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,7 @@ import threading
|
|||||||
import queue
|
import queue
|
||||||
import json # 导入 json 模块
|
import json # 导入 json 模块
|
||||||
|
|
||||||
from ai_studio import call_dify_workflow, call_coze_workflow
|
from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
|
||||||
from databases import *
|
from databases import *
|
||||||
|
|
||||||
from images_edit import download_and_process_images
|
from images_edit import download_and_process_images
|
||||||
@ -25,6 +25,8 @@ def process_link(link, ai_service):
|
|||||||
title_text, article_text, img_urls = toutiao_extract_content(link)
|
title_text, article_text, img_urls = toutiao_extract_content(link)
|
||||||
elif link.startswith("https://mp.weixin.qq.co"):
|
elif link.startswith("https://mp.weixin.qq.co"):
|
||||||
title_text, article_text, img_urls = wechat_extract_content(link)
|
title_text, article_text, img_urls = wechat_extract_content(link)
|
||||||
|
elif link.startswith("https://www.163.com"):
|
||||||
|
title_text, article_text, img_urls = wangyi_extract_content(link)
|
||||||
else:
|
else:
|
||||||
title_text, article_text, img_urls = "", "", []
|
title_text, article_text, img_urls = "", "", []
|
||||||
|
|
||||||
@ -44,7 +46,7 @@ def process_link(link, ai_service):
|
|||||||
|
|
||||||
title = extract_content_until_punctuation(article_text).replace("正文:", "")
|
title = extract_content_until_punctuation(article_text).replace("正文:", "")
|
||||||
|
|
||||||
print(title)
|
print(img_urls)
|
||||||
print(article_text)
|
print(article_text)
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -78,55 +80,56 @@ def process_link(link, ai_service):
|
|||||||
# }
|
# }
|
||||||
message_content = call_dify_workflow(input_data)
|
message_content = call_dify_workflow(input_data)
|
||||||
elif ai_service == "coze":
|
elif ai_service == "coze":
|
||||||
logger.info("coze正在处理")
|
# logger.info("coze正在处理")
|
||||||
weijin = ""
|
# weijin = ""
|
||||||
if check_keywords:
|
# if check_keywords:
|
||||||
weijin = "违禁"
|
# weijin = "违禁"
|
||||||
# 从配置加载 Coze input_data 模板
|
# # 从配置加载 Coze input_data 模板
|
||||||
input_data_template_str = CONFIG['Coze'].get('input_data_template',
|
# input_data_template_str = CONFIG['Coze'].get('input_data_template',
|
||||||
'{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
|
# '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
|
||||||
try:
|
# try:
|
||||||
# 解析模板字符串为字典
|
# # 解析模板字符串为字典
|
||||||
input_data_template = json.loads(input_data_template_str)
|
# input_data_template = json.loads(input_data_template_str)
|
||||||
# 使用实际变量格式化模板
|
# # 使用实际变量格式化模板
|
||||||
input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
|
# input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
|
||||||
input_data_template.items()}
|
# input_data_template.items()}
|
||||||
except (json.JSONDecodeError, KeyError, AttributeError) as e:
|
# except (json.JSONDecodeError, KeyError, AttributeError) as e:
|
||||||
logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
|
# logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
|
||||||
input_data = {
|
# input_data = {
|
||||||
"article": article_text,
|
# "article": article_text,
|
||||||
"link": link,
|
# "link": link,
|
||||||
"weijin": weijin
|
# "weijin": weijin
|
||||||
|
# }
|
||||||
|
input_data = {
|
||||||
|
"article": article_text
|
||||||
}
|
}
|
||||||
|
message_content = call_coze_article_workflow(input_data)
|
||||||
|
|
||||||
msg = call_coze_workflow(input_data)
|
|
||||||
message_content = msg['article']
|
|
||||||
result = msg['result']
|
|
||||||
if result == "已经创作过":
|
|
||||||
return
|
|
||||||
# 获取当前时间并格式化
|
# 获取当前时间并格式化
|
||||||
current_time = datetime.now().strftime("%H:%M:%S")
|
current_time = datetime.now().strftime("%H:%M:%S")
|
||||||
|
|
||||||
# 打印当前时间
|
# 打印当前时间
|
||||||
print("当前时间:", current_time)
|
print("当前时间:", current_time)
|
||||||
|
|
||||||
finally_article = message_content.replace("正文:", "") + "\n"
|
|
||||||
|
|
||||||
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
|
file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
|
||||||
|
|
||||||
if '*' in finally_article or '#' in finally_article or "-" in finally_article:
|
|
||||||
|
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
|
||||||
|
|
||||||
|
if '*' in message_content or '#' in message_content or "-" in message_content:
|
||||||
# 使用正则表达式一次性替换多个字符
|
# 使用正则表达式一次性替换多个字符
|
||||||
old_content = re.sub(r'[*#-]', '', message_content)
|
old_content = re.sub(r'[*#-]', '', message_content)
|
||||||
else:
|
else:
|
||||||
# 如果不需要替换,直接使用原内容
|
# 如果不需要替换,直接使用原内容
|
||||||
old_content = finally_article
|
old_content = message_content
|
||||||
|
|
||||||
print("改写完成的文章:" + old_content)
|
print("改写完成的文章:" + old_content)
|
||||||
|
|
||||||
# 删除AI词汇
|
# 删除AI词汇
|
||||||
content = old_content
|
content = old_content
|
||||||
|
|
||||||
check_link_insert(host, user, password, database, link)
|
|
||||||
|
|
||||||
# 判断文章合规度
|
# 判断文章合规度
|
||||||
if text_detection(content) == "合规":
|
if text_detection(content) == "合规":
|
||||||
@ -141,7 +144,7 @@ def process_link(link, ai_service):
|
|||||||
logging.info('文本已经保存')
|
logging.info('文本已经保存')
|
||||||
|
|
||||||
if img_urls:
|
if img_urls:
|
||||||
download_and_process_images(img_urls, title)
|
download_and_process_images(img_urls, file_name)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"处理链接 {link} 时出错: {e}")
|
logging.error(f"处理链接 {link} 时出错: {e}")
|
||||||
@ -163,13 +166,13 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
|
|||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
logging.info(f"总共{len(links)}个链接")
|
logging.info(f"总共{len(links)}个链接")
|
||||||
if check_link_exists(host, user, password, database, link):
|
# if check_link_exists(host, user, password, database, link):
|
||||||
logger.info(f"链接已存在: {link}")
|
# logger.info(f"链接已存在: {link}")
|
||||||
continue
|
# continue
|
||||||
else:
|
# else:
|
||||||
filtered_links.append(link)
|
filtered_links.append(link)
|
||||||
logger.info(f"链接不存在: {link}")
|
# logger.info(f"链接不存在: {link}")
|
||||||
print("链接不存在,存储到过滤器中:", link)
|
# print("链接不存在,存储到过滤器中:", link)
|
||||||
|
|
||||||
if not filtered_links:
|
if not filtered_links:
|
||||||
logger.info("没有新链接需要处理")
|
logger.info("没有新链接需要处理")
|
||||||
|
|||||||
@ -4,17 +4,24 @@ import requests
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
|
from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content
|
||||||
|
|
||||||
from utils import handle_duplicate_files_advanced
|
from utils import handle_duplicate_files_advanced
|
||||||
|
from images_edit import download_and_process_images
|
||||||
|
|
||||||
# title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
|
# title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
|
||||||
# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
|
# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
|
||||||
# title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
|
# title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
|
||||||
# print(imgs)
|
# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
|
||||||
# print(type(imgs))
|
title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
|
||||||
|
|
||||||
|
print(title)
|
||||||
|
print(article)
|
||||||
|
print(imgs)
|
||||||
|
print(type(imgs))
|
||||||
|
#
|
||||||
|
# download_and_process_images(imgs,"1")
|
||||||
|
|
||||||
name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
|
#
|
||||||
print(name[0])
|
# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
|
||||||
|
# print(name[0])
|
||||||
@ -145,9 +145,15 @@ def insert_images_into_paragraphs(paragraphs, image_folder, doc, title):
|
|||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if os.path.exists(image_folder):
|
||||||
|
images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
|
||||||
|
img.lower().endswith(('jpg'))])
|
||||||
|
else:
|
||||||
|
images = []
|
||||||
|
|
||||||
# 获取图片列表并排序
|
# 获取图片列表并排序
|
||||||
images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
|
# images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
|
||||||
img.lower().endswith(('jpg'))])
|
# img.lower().endswith(('jpg'))])
|
||||||
# images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
|
# images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
|
||||||
# # img.lower().endswith(('png', 'jpg', 'jpeg'))])
|
# # img.lower().endswith(('png', 'jpg', 'jpeg'))])
|
||||||
|
|
||||||
@ -271,7 +277,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
|
|||||||
else:
|
else:
|
||||||
new_text = text.replace("```markdown", "").replace("```", "")
|
new_text = text.replace("```markdown", "").replace("```", "")
|
||||||
content = new_text
|
content = new_text
|
||||||
image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
|
# image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
|
||||||
|
# image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
|
||||||
|
from pathlib import Path
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
img_path = Path(img_path)
|
||||||
|
image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
|
||||||
|
|
||||||
# crop_and_replace_images(image_folder)
|
# crop_and_replace_images(image_folder)
|
||||||
|
|
||||||
create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
|
create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user