Modify web content fetching code
commit 666494c0c2
parent 113c97c887
@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
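Note: with a readonly Combobox, the StringVar default is the only thing that selects the initial entry, so this one-line change flips the app's startup workflow to coze. A minimal sketch of that behavior (widget names taken from the hunk above):

import tkinter as tk
from tkinter import ttk

root = tk.Tk()
ai_service_var = tk.StringVar(value="coze")  # "coze" is preselected at startup
ai_service_combo = ttk.Combobox(root, textvariable=ai_service_var,
                                values=["dify", "coze"], state="readonly")
ai_service_combo.pack(padx=5, pady=5)
print(ai_service_var.get())  # -> "coze", before the user touches the widget
root.destroy()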
@@ -87,7 +87,10 @@ def download_and_process_images(img_urls, article_title):
     """
     Download and process images
     """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    logger.info(IMGS_BASE_PATH)
+    img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
+    # img_dir_path = IMGS_BASE_PATH + "/" + article_title
     logger.info(img_dir_path)
     safe_open_directory(img_dir_path)

     for i, img_url in enumerate(img_urls):
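The str() casts matter when IMGS_BASE_PATH comes from config as a pathlib.Path or when article_title arrives as a non-string; os.path.join raises TypeError on components that are neither str nor PathLike. A small sketch of the failure the cast avoids (the int title is an assumption for illustration):

import os
from pathlib import Path

IMGS_BASE_PATH = Path("imgs")   # assumption: the base path may be a Path object
article_title = 20250101        # assumption: a title that arrived as an int

# os.path.join(IMGS_BASE_PATH, article_title) would raise
# TypeError: ... should be str ... not 'int'; the casts sidestep it:
img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
print(img_dir_path)  # imgs/20250101 (imgs\20250101 on Windows)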
@@ -26,6 +26,8 @@ def process_link(link, ai_service):
     else:
         title_text, article_text, img_urls = "", "", []

+    print(title_text)
+
     if title_text == "":
         return
     elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):
     # finally_article = message_content.replace("正文:", "") + "\n"

-    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

     article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
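The new [0] implies handle_duplicate_files_advanced returns a sequence whose first element is the collision-free name. The real helper lives in utils and is not shown in this diff; a hypothetical stand-in with that shape:

import os

def handle_duplicate_files_advanced(base_path, title):
    # Hypothetical sketch: return (unique_name, attempts) so callers index [0]
    candidate, attempts = title, 0
    while os.path.exists(os.path.join(base_path, f"{candidate}.txt")):
        attempts += 1
        candidate = f"{title}({attempts})"
    return candidate, attempts

file_name = handle_duplicate_files_advanced("articles", "my-title")[0]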
@@ -2,7 +2,7 @@ import threading
 import queue
 import json  # import the json module

-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
 from databases import *

 from images_edit import download_and_process_images
@@ -25,6 +25,8 @@ def process_link(link, ai_service):
         title_text, article_text, img_urls = toutiao_extract_content(link)
     elif link.startswith("https://mp.weixin.qq.co"):
         title_text, article_text, img_urls = wechat_extract_content(link)
+    elif link.startswith("https://www.163.com"):
+        title_text, article_text, img_urls = wangyi_extract_content(link)
     else:
         title_text, article_text, img_urls = "", "", []
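Each new source site adds another elif to this chain. Not part of the commit, but a prefix-to-extractor table is one way to keep the dispatch flat as more extractors arrive; a sketch assuming the same get_web_content helpers:

from get_web_content import (toutiao_extract_content, wechat_extract_content,
                             wangyi_extract_content)

EXTRACTORS = {
    "https://www.toutiao.com": toutiao_extract_content,
    "https://mp.weixin.qq.co": wechat_extract_content,  # prefix kept exactly as in the diff
    "https://www.163.com": wangyi_extract_content,
}

def extract(link):
    for prefix, extractor in EXTRACTORS.items():
        if link.startswith(prefix):
            return extractor(link)
    return "", "", []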
@@ -44,7 +46,7 @@ def process_link(link, ai_service):
     title = extract_content_until_punctuation(article_text).replace("正文:", "")

     print(title)
     print(img_urls)
     print(article_text)

     from datetime import datetime
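The title is carved out of article_text up to the first punctuation mark, with the leading 正文: ("body text:") label stripped. extract_content_until_punctuation is not shown in this commit; a plausible sketch of its contract:

import re

def extract_content_until_punctuation(text):
    # Hypothetical sketch: keep everything before the first sentence punctuation
    match = re.search(r"[。!?!?,,]", text)
    return text[:match.start()] if match else text

title = extract_content_until_punctuation("正文:今天发生了一件事。后续内容").replace("正文:", "")
print(title)  # 今天发生了一件事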
@@ -78,55 +80,56 @@ def process_link(link, ai_service):
         # }
         message_content = call_dify_workflow(input_data)
     elif ai_service == "coze":
-        logger.info("coze正在处理")
-        weijin = ""
-        if check_keywords:
-            weijin = "违禁"
-        # Load the Coze input_data template from config
-        input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                                                     '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-        try:
-            # Parse the template string into a dict
-            input_data_template = json.loads(input_data_template_str)
-            # Format the template with the actual variables
-            input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                          input_data_template.items()}
-        except (json.JSONDecodeError, KeyError, AttributeError) as e:
-            logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
-            input_data = {
-                "article": article_text,
-                "link": link,
-                "weijin": weijin
-            }
+        # logger.info("coze正在处理")
+        # weijin = ""
+        # if check_keywords:
+        #     weijin = "违禁"
+        # # Load the Coze input_data template from config
+        # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+        #                                              '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+        # try:
+        #     # Parse the template string into a dict
+        #     input_data_template = json.loads(input_data_template_str)
+        #     # Format the template with the actual variables
+        #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+        #                   input_data_template.items()}
+        # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+        #     logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
+        #     input_data = {
+        #         "article": article_text,
+        #         "link": link,
+        #         "weijin": weijin
+        #     }
+        input_data = {
+            "article": article_text
+        }
+        message_content = call_coze_article_workflow(input_data)

-        msg = call_coze_workflow(input_data)
-        message_content = msg['article']
-        result = msg['result']
-        if result == "已经创作过":
-            return
     # Get the current time and format it
     current_time = datetime.now().strftime("%H:%M:%S")

     # Print the current time
     print("当前时间:", current_time)

-    finally_article = message_content.replace("正文:", "") + "\n"
-
-    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

-    if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
+
+    if '*' in message_content or '#' in message_content or "-" in message_content:
         # Replace several characters in one pass with a regex
         old_content = re.sub(r'[*#-]', '', message_content)
     else:
         # No replacement needed, keep the content as-is
-        old_content = finally_article
+        old_content = message_content

     print("改写完成的文章:" + old_content)

     # Strip AI-sounding vocabulary
     content = old_content

     check_link_insert(host, user, password, database, link)


     # Check the article's compliance
     if text_detection(content) == "合规":
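The switch from call_coze_workflow to call_coze_article_workflow also changes the return contract: the old call was unpacked as a dict with 'article' and 'result' keys (returning early on 已经创作过, "already rewritten"), while message_content now feeds .replace() and re.sub() directly, so the new call presumably returns the article as a plain string. Assumed shapes, inferred from usage only; the real signatures live in ai_studio:

def call_coze_workflow(input_data):
    # old contract (inferred): rewritten text plus a dedup flag
    return {"article": "...rewritten text...", "result": "已经创作过"}

def call_coze_article_workflow(input_data):
    # new contract (inferred): the rewritten article as a plain string
    return "...rewritten text..."

message_content = call_coze_article_workflow({"article": "original text"})
assert isinstance(message_content, str)  # downstream .replace()/re.sub need a str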
@@ -141,7 +144,7 @@ def process_link(link, ai_service):
             logging.info('文本已经保存')

             if img_urls:
-                download_and_process_images(img_urls, title)
+                download_and_process_images(img_urls, file_name)

     except Exception as e:
         logging.error(f"处理链接 {link} 时出错: {e}")
@@ -163,13 +166,13 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
     for link in links:
         logging.info(f"总共{len(links)}个链接")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"链接已存在: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"链接不存在: {link}")
-            print("链接不存在,存储到过滤器中:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"链接已存在: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link)
+        # logger.info(f"链接不存在: {link}")
+        # print("链接不存在,存储到过滤器中:", link)

     if not filtered_links:
         logger.info("没有新链接需要处理")
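With check_link_exists commented out, every incoming link is appended, so links already stored in the database will be fetched and rewritten again on the next run. If the database round-trip was the bottleneck, a batch-local set at least drops duplicates within one invocation (a sketch, not part of the commit):

links = ["https://www.163.com/a", "https://www.163.com/a", "https://www.sohu.com/b"]

seen = set()
filtered_links = []
for link in links:
    if link in seen:  # batch-local dedup only; no database involved
        continue
    seen.add(link)
    filtered_links.append(link)

print(filtered_links)  # ['https://www.163.com/a', 'https://www.sohu.com/b']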
@@ -4,17 +4,24 @@ import requests

 from bs4 import BeautifulSoup

-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
+from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content

 from utils import handle_duplicate_files_advanced

 from images_edit import download_and_process_images

 # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
+# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
 # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
 # print(imgs)
 # print(type(imgs))
+# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
+title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")

 print(title)
 print(article)
 print(imgs)
 print(type(imgs))
 #
 # download_and_process_images(imgs,"1")

-name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-print(name[0])
+#
+# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
+# print(name[0])
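The test script treats every *_extract_content helper as returning a (title, article, imgs) triple, and the new souhu_extract_content is used the same way. The extractor itself is not in this diff; a generic sketch of that shape using the requests and BeautifulSoup imports already at the top of the file (the selectors are placeholders, not Sohu's real markup):

import requests
from bs4 import BeautifulSoup

def example_extract_content(link):
    resp = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""
    article = "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))
    imgs = [img["src"] for img in soup.find_all("img") if img.get("src")]
    return title, article, imgs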
@@ -145,9 +145,15 @@ def insert_images_into_paragraphs(paragraphs, image_folder, doc, title):
     :return:
     """
+    if os.path.exists(image_folder):
+        images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+                         img.lower().endswith(('jpg'))])
+    else:
+        images = []

     # Get the image list and sort it
-    images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
-                     img.lower().endswith(('jpg'))])
+    # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+    #                  img.lower().endswith(('jpg'))])
     # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
     # #                  img.lower().endswith(('png', 'jpg', 'jpeg'))])
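The new guard avoids the FileNotFoundError that os.listdir raises when the image folder was never created. One quirk carried over from the old line: endswith(('jpg')) passes a plain string, because ('jpg') is not a tuple; matching several extensions needs a real tuple. A guarded variant with a multi-extension filter (the extension list is an assumption):

import os

def list_images(image_folder):
    # ('jpg') == 'jpg'; a one-element tuple needs a comma: ('jpg',)
    if not os.path.exists(image_folder):
        return []
    return sorted(
        os.path.join(image_folder, img)
        for img in os.listdir(image_folder)
        if img.lower().endswith(("png", "jpg", "jpeg"))
    )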
@@ -271,7 +277,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
     else:
         new_text = text.replace("```markdown", "").replace("```", "")
     content = new_text
-    image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+    from pathlib import Path
+
+    img_path = Path(img_path)
+    image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")

     # crop_and_replace_images(image_folder)

     create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
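pathlib's / operator replaces the hand-rolled backslash concatenation and picks the right separator on any OS; a quick equivalence check (paths are illustrative):

from pathlib import Path

img_path = r"F:\work\imgs"
txt_name = "example.txt"

old_folder = img_path + "\\" + txt_name.replace(".txt", "").rstrip(".")  # Windows-only
new_folder = Path(img_path) / txt_name.replace(".txt", "").rstrip(".")   # portable

print(old_folder)       # F:\work\imgs\example
print(str(new_folder))  # same string on Windows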