Modify the web page content extraction code

wsb1224 2025-05-26 09:23:17 +08:00
parent 113c97c887
commit 5397e7cfc2
7 changed files with 102 additions and 59 deletions

View File

@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
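
Note: the default workflow switches from "dify" to "coze" here. Because the Combobox is created with state="readonly", the user can only choose between the listed values, and the StringVar seeds the initial selection. A minimal standalone sketch of that tkinter pattern (illustrative only, not part of the repo):

import tkinter as tk
from tkinter import ttk

root = tk.Tk()
service = tk.StringVar(value="coze")  # same default this commit switches to
combo = ttk.Combobox(root, textvariable=service, values=["dify", "coze"],
                     width=10, state="readonly")
combo.pack(padx=5, pady=5)
print(service.get())  # "coze" until the user picks a different entry
root.mainloop()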

View File

@@ -83,11 +83,18 @@ def download_image(image_url, save_path):
         print(f"请求出错:{e}")
-def download_and_process_images(img_urls, article_title):
+def download_and_process_images(img_urls, article_title, save_dir=None):
+    """
+    Download and process images.
+    :param img_urls: list of image URLs
+    :param article_title: the article title
+    :param save_dir: custom save directory; if None, the default directory is used
+    """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    if save_dir is None:
+        save_dir = IMGS_BASE_PATH
+    img_dir_path = os.path.join(str(save_dir), str(article_title))
     logger.info(f"图片保存路径:{img_dir_path}")
     safe_open_directory(img_dir_path)
     for i, img_url in enumerate(img_urls):
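
The new save_dir parameter keeps the old behaviour as the default while letting callers redirect images into a per-type folder. A sketch of just that resolution logic (IMGS_BASE_PATH below is a placeholder for the project's configured base path):

import os

IMGS_BASE_PATH = r"F:\work\imgs"  # placeholder for the configured base path

def resolve_img_dir(article_title, save_dir=None):
    if save_dir is None:       # old behaviour: everything under the base path
        save_dir = IMGS_BASE_PATH
    return os.path.join(str(save_dir), str(article_title))

print(resolve_img_dir("some-article"))                        # F:\work\imgs\some-article
print(resolve_img_dir("some-article", r"F:\work\pics\news"))  # custom per-type directory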

View File

@@ -26,6 +26,8 @@ def process_link(link, ai_service):
         else:
             title_text, article_text, img_urls = "", "", []
+        print(title_text)
         if title_text == "":
             return
         elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):
         # finally_article = message_content.replace("正文:", "") + "\n"
-        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
         article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
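
The call sites now index [0] into the helper's return value, so handle_duplicate_files_advanced evidently returns a sequence whose first element is the collision-free file name. An illustrative sketch of that contract only (the real implementation lives in utils and may differ):

import os

def handle_duplicate_files_advanced(base_path, name):
    candidate, n = name, 1
    while os.path.exists(os.path.join(base_path, f"{candidate}.txt")):
        candidate = f"{name}({n})"  # e.g. "title(1)", "title(2)", ...
        n += 1
    return candidate, n - 1         # (unique name, number of collisions seen)

file_name = handle_duplicate_files_advanced("articles", "title")[0]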

View File

@@ -2,7 +2,7 @@ import threading
 import queue
 import json  # import the json module
-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
 from databases import *
 from images_edit import download_and_process_images
@@ -12,7 +12,8 @@ from config import *
 # ============================== Main program ===========================
-def process_link(link, ai_service):
+def process_link(link_info, ai_service):
+    link, article_type = link_info  # unpack the link and its type
     """
     Process a single link
     :param link: the link to process
@@ -25,6 +26,8 @@ def process_link(link, ai_service):
             title_text, article_text, img_urls = toutiao_extract_content(link)
         elif link.startswith("https://mp.weixin.qq.co"):
             title_text, article_text, img_urls = wechat_extract_content(link)
+        elif link.startswith("https://www.163.com"):
+            title_text, article_text, img_urls = wangyi_extract_content(link)
         else:
             title_text, article_text, img_urls = "", "", []
@@ -44,7 +47,7 @@ def process_link(link, ai_service):
         title = extract_content_until_punctuation(article_text).replace("正文:", "")
         print(title)
-        print(img_urls)
+        print(article_text)
         from datetime import datetime
@@ -78,55 +81,61 @@ def process_link(link, ai_service):
         #     }
         #     message_content = call_dify_workflow(input_data)
         elif ai_service == "coze":
-            logger.info("coze正在处理")
-            weijin = ""
-            if check_keywords:
-                weijin = "违禁"
-            # Load the Coze input_data template from config
-            input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-            try:
-                # Parse the template string into a dict
-                input_data_template = json.loads(input_data_template_str)
-                # Format the template with the actual variables
-                input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                              input_data_template.items()}
-            except (json.JSONDecodeError, KeyError, AttributeError) as e:
-                logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
-                input_data = {
-                    "article": article_text,
-                    "link": link,
-                    "weijin": weijin
-                }
+            # logger.info("coze正在处理")
+            # weijin = ""
+            # if check_keywords:
+            #     weijin = "违禁"
+            # # Load the Coze input_data template from config
+            # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+            #     '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+            # try:
+            #     # Parse the template string into a dict
+            #     input_data_template = json.loads(input_data_template_str)
+            #     # Format the template with the actual variables
+            #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+            #                   input_data_template.items()}
+            # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+            #     logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
+            #     input_data = {
+            #         "article": article_text,
+            #         "link": link,
+            #         "weijin": weijin
+            #     }
+            input_data = {
+                "article": article_text
+            }
+            message_content = call_coze_article_workflow(input_data)
+            msg = call_coze_workflow(input_data)
+            message_content = msg['article']
+            result = msg['result']
+            if result == "已经创作过":
+                return
         # Get the current time and format it
         current_time = datetime.now().strftime("%H:%M:%S")
         # Print the current time
         print("当前时间:", current_time)
         finally_article = message_content.replace("正文:", "") + "\n"
-        article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
-        if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+        # Create the per-type directory
+        type_dir = os.path.join(ARTICLES_BASE_PATH, article_type)
+        safe_open_directory(type_dir)
+        # Save the article under the type directory
+        article_save_path = os.path.join(type_dir, f"{file_name}.txt")
+        if '*' in message_content or '#' in message_content or "-" in message_content:
             # Replace several characters at once with a regex
             old_content = re.sub(r'[*#-]', '', message_content)
         else:
             # No replacement needed; use the content as-is
-            old_content = finally_article
+            old_content = message_content
         print("改写完成的文章:" + old_content)
         # Strip AI-flavored wording
         content = old_content
         check_link_insert(host, user, password, database, link)
         # Check the article's compliance
         if text_detection(content) == "合规":
@@ -141,7 +150,10 @@ def process_link(link, ai_service):
             logging.info('文本已经保存')
         if img_urls:
-            download_and_process_images(img_urls, title)
+            # Create the picture directory under the type directory
+            type_picture_dir = os.path.join(PICTURE_BASE_PATH, article_type)
+            safe_open_directory(type_picture_dir)
+            download_and_process_images(img_urls, file_name, type_picture_dir)
     except Exception as e:
         logging.error(f"处理链接 {link} 时出错: {e}")
@@ -161,15 +173,16 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"):
     password = CONFIG['Database']['password']
     database = CONFIG['Database']['database']
-    for link in links:
+    for link_info in links:
+        link = link_info[0]  # get the link
         logging.info(f"总共{len(links)}个链接")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"链接已存在: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"链接不存在: {link}")
-            print("链接不存在,存储到过滤器中:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"链接已存在: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link)
+        # logger.info(f"链接不存在: {link}")
+        # print("链接不存在,存储到过滤器中:", link)
     if not filtered_links:
         logger.info("没有新链接需要处理")

View File

@@ -4,17 +4,24 @@ import requests
 from bs4 import BeautifulSoup
-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
+from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content
 from utils import handle_duplicate_files_advanced
 from images_edit import download_and_process_images
 # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
+# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
 # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
+# print(imgs)
+# print(type(imgs))
+# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
+title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
 print(title)
 print(article)
 print(imgs)
 print(type(imgs))
+#
 # download_and_process_images(imgs,"1")
-name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-print(name[0])
+#
+# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
+# print(name[0])

View File

@@ -271,7 +271,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
     else:
         new_text = text.replace("```markdown", "").replace("```", "")
         content = new_text
-    image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+    from pathlib import Path
+    img_path = Path(img_path)
+    image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
     # crop_and_replace_images(image_folder)
     create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
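
The manual '\\' string concatenation is replaced by pathlib, whose / operator joins path segments portably. A quick equivalence sketch (paths invented):

from pathlib import Path

img_path = Path(r"F:\work\imgs")
txt_name = "my article.txt"
image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
print(image_folder)  # F:\work\imgs\my article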

View File

@@ -89,14 +89,21 @@ def extract_content_until_punctuation(text, punctuations=r'[,。!?;]'):
-# Read a single column from an Excel sheet and return its contents as a list
+# Read the link and type columns from an Excel sheet and return them as a list of tuples
 def read_excel(file_name):
     datas = pd.read_excel(file_name)
-    first_column_name = datas.columns[0]
-    first_colunm_data = datas[first_column_name].tolist()
-    print(first_colunm_data)
+    first_column_name = datas.columns[0]  # link column
+    type_column_name = '类型'  # type column
-    return first_colunm_data
+    links = datas[first_column_name].tolist()
+    # Read the type column if it exists; otherwise fall back to the default type
+    types = datas[type_column_name].tolist() if type_column_name in datas.columns else ['默认'] * len(links)
+    # Combine the links and types into a list of tuples
+    result = list(zip(links, types))
+    print(result)
+    return result
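
A worked example of the new read_excel behaviour, with the DataFrame built inline instead of loaded from an .xlsx file (column names as in the code above):

import pandas as pd

datas = pd.DataFrame({
    "链接": ["https://www.163.com/a", "https://www.sohu.com/b"],
    "类型": ["新闻", "娱乐"],
})
links = datas[datas.columns[0]].tolist()
types = datas["类型"].tolist() if "类型" in datas.columns else ["默认"] * len(links)
print(list(zip(links, types)))
# [('https://www.163.com/a', '新闻'), ('https://www.sohu.com/b', '娱乐')]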