Modify the web-content extraction code
This commit is contained in:
parent 113c97c887
commit 5397e7cfc2
@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
@@ -83,11 +83,18 @@ def download_image(image_url, save_path):
         print(f"Request error: {e}")


-def download_and_process_images(img_urls, article_title):
+def download_and_process_images(img_urls, article_title, save_dir=None):
+    """
+    Download and process images.
+    :param img_urls: list of image URLs
+    :param article_title: article title
+    :param save_dir: custom save directory; the default directory is used when None
+    """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    if save_dir is None:
+        save_dir = IMGS_BASE_PATH
+
+    img_dir_path = os.path.join(str(save_dir), str(article_title))
     logger.info(f"Image save path: {img_dir_path}")
     safe_open_directory(img_dir_path)

     for i, img_url in enumerate(img_urls):
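For reference, a minimal usage sketch of the new optional save_dir parameter (the URLs and directory names below are hypothetical): existing call sites keep the old behaviour because save_dir defaults to None, while new callers can route images into a custom directory.

img_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]  # hypothetical URLs

# Old behaviour: images land under IMGS_BASE_PATH/<article_title>/
download_and_process_images(img_urls, "some-article")

# New behaviour: images land under <save_dir>/<article_title>/
download_and_process_images(img_urls, "some-article", save_dir="pictures/科技")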
@@ -26,6 +26,8 @@ def process_link(link, ai_service):
     else:
         title_text, article_text, img_urls = "", "", []

+    print(title_text)
+
     if title_text == "":
         return
     elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):

     # finally_article = message_content.replace("正文:", "") + "\n"

-    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

     article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
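The added [0] suggests handle_duplicate_files_advanced returns a sequence whose first element is the collision-free file name (the test file further down prints name[0] as well); without the index, the whole sequence would be formatted into the path. A sketch under that assumption:

name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH, title_text)
file_name = name[0]  # assumed: first element is the deduplicated file name, e.g. "title (2)"
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")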
@@ -2,7 +2,7 @@ import threading
 import queue
 import json  # import the json module

-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow, call_coze_article_workflow
 from databases import *

 from images_edit import download_and_process_images
@@ -12,7 +12,8 @@ from config import *


 # ==============================Main program===========================
-def process_link(link, ai_service):
+def process_link(link_info, ai_service):
+    link, article_type = link_info  # unpack the link and its type
     """
     Process a single link
     :param link: the link to process
@@ -25,6 +26,8 @@ def process_link(link, ai_service):
         title_text, article_text, img_urls = toutiao_extract_content(link)
     elif link.startswith("https://mp.weixin.qq.co"):
         title_text, article_text, img_urls = wechat_extract_content(link)
+    elif link.startswith("https://www.163.com"):
+        title_text, article_text, img_urls = wangyi_extract_content(link)
     else:
         title_text, article_text, img_urls = "", "", []
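Not part of the commit, but a hedged sketch of one way to keep this dispatch flat as more sites are added: map URL prefixes to extractor functions. The Toutiao prefix below is an assumption; each extractor returns a (title, article, img_urls) tuple per the branches above.

EXTRACTORS = {
    "https://www.toutiao.com": toutiao_extract_content,  # assumed prefix
    "https://mp.weixin.qq.co": wechat_extract_content,
    "https://www.163.com": wangyi_extract_content,
}

def extract_by_prefix(link):
    # Returns (title, article, img_urls); empty values for unsupported sites
    for prefix, extractor in EXTRACTORS.items():
        if link.startswith(prefix):
            return extractor(link)
    return "", "", []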
@@ -44,7 +47,7 @@ def process_link(link, ai_service):

     title = extract_content_until_punctuation(article_text).replace("正文:", "")

-    print(title)
+    print(img_urls)
     print(article_text)

     from datetime import datetime
@@ -78,55 +81,61 @@ def process_link(link, ai_service):
         # }
         message_content = call_dify_workflow(input_data)
     elif ai_service == "coze":
-        logger.info("Coze is processing")
-        weijin = ""
-        if check_keywords:
-            weijin = "违禁"
-        # Load the Coze input_data template from the config
-        input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                                                     '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-        try:
-            # Parse the template string into a dict
-            input_data_template = json.loads(input_data_template_str)
-            # Format the template with the actual variables
-            input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                          input_data_template.items()}
-        except (json.JSONDecodeError, KeyError, AttributeError) as e:
-            logger.error(f"Error processing the Coze input_data template: {e}. Using the default template.")
-            input_data = {
-                "article": article_text,
-                "link": link,
-                "weijin": weijin
-            }
+        # logger.info("Coze is processing")
+        # weijin = ""
+        # if check_keywords:
+        #     weijin = "违禁"
+        # # Load the Coze input_data template from the config
+        # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+        #                                              '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+        # try:
+        #     # Parse the template string into a dict
+        #     input_data_template = json.loads(input_data_template_str)
+        #     # Format the template with the actual variables
+        #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+        #                   input_data_template.items()}
+        # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+        #     logger.error(f"Error processing the Coze input_data template: {e}. Using the default template.")
+        #     input_data = {
+        #         "article": article_text,
+        #         "link": link,
+        #         "weijin": weijin
+        #     }
+        input_data = {
+            "article": article_text
+        }
+        message_content = call_coze_article_workflow(input_data)

-        msg = call_coze_workflow(input_data)
-        message_content = msg['article']
-        result = msg['result']
-        if result == "已经创作过":
-            return
     # Get the current time and format it
     current_time = datetime.now().strftime("%H:%M:%S")

     # Print the current time
     print("Current time:", current_time)

     finally_article = message_content.replace("正文:", "") + "\n"

-    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]

-    if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+    # Create the type directory
+    type_dir = os.path.join(ARTICLES_BASE_PATH, article_type)
+    safe_open_directory(type_dir)
+
+    # Save the article under the type directory
+    article_save_path = os.path.join(type_dir, f"{file_name}.txt")
+
+    if '*' in message_content or '#' in message_content or "-" in message_content:
         # Replace multiple characters at once with a regular expression
         old_content = re.sub(r'[*#-]', '', message_content)
     else:
         # If no replacement is needed, use the original content directly
-        old_content = finally_article
+        old_content = message_content

     print("Rewritten article: " + old_content)

     # Remove AI vocabulary
     content = old_content

     check_link_insert(host, user, password, database, link)


     # Check article compliance
     if text_detection(content) == "合规":
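In short, the Coze branch now sends a single-field payload to call_coze_article_workflow and uses its return value directly, instead of building a templated payload for call_coze_workflow and unpacking msg['article'] and msg['result']; this also drops the early return on the "已经创作过" dedupe flag. A side-by-side sketch, with the dict shapes taken from the hunk above:

# before: templated payload, dict response carrying a dedupe flag
# input_data = {"article": article_text, "link": link, "weijin": weijin}
# msg = call_coze_workflow(input_data)
# message_content, result = msg['article'], msg['result']  # result == "已经创作过" -> skip

# after: minimal payload, response used directly as the article text
input_data = {"article": article_text}
message_content = call_coze_article_workflow(input_data)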
@@ -141,7 +150,10 @@ def process_link(link, ai_service):
             logging.info('Text saved')

             if img_urls:
-                download_and_process_images(img_urls, title)
+                # Create the picture directory under the type directory
+                type_picture_dir = os.path.join(PICTURE_BASE_PATH, article_type)
+                safe_open_directory(type_picture_dir)
+                download_and_process_images(img_urls, file_name, type_picture_dir)

     except Exception as e:
         logging.error(f"Error while processing link {link}: {e}")
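With the type directories in place, output is grouped per article type; an illustrative layout (directory names are hypothetical, and the image file names depend on the download loop in images_edit):

ARTICLES_BASE_PATH/
    科技/                  <- article_type from the Excel 类型 column
        <file_name>.txt
PICTURE_BASE_PATH/
    科技/
        <file_name>/       <- created via download_and_process_images(..., type_picture_dir)
            downloaded image files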
@@ -161,15 +173,16 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
     password = CONFIG['Database']['password']
     database = CONFIG['Database']['database']

-    for link in links:
+    for link_info in links:
+        link = link_info[0]  # get the link
         logging.info(f"{len(links)} links in total")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"Link already exists: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"Link does not exist: {link}")
-            print("Link does not exist; adding it to the filter:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"Link already exists: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link)
+        # logger.info(f"Link does not exist: {link}")
+        # print("Link does not exist; adding it to the filter:", link)

     if not filtered_links:
         logger.info("No new links to process")
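Grounded in the read_excel change further down, links is now a list of (link, type) tuples, which is why the loop unpacks link_info[0]. The type values below are illustrative:

links = [
    ("https://www.toutiao.com/article/7496132108239356479/", "科技"),
    ("https://www.163.com/dy/article/JV4K9D020553VRO2.html", "默认"),  # no 类型 column -> '默认'
]
for link_info in links:
    link = link_info[0]  # the URL; link_info[1] is the article_type consumed by process_link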
@@ -4,17 +4,24 @@ import requests

 from bs4 import BeautifulSoup

-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
+from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content
 from utils import handle_duplicate_files_advanced
 from images_edit import download_and_process_images

 # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
+# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/")
 # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
 # print(imgs)
 # print(type(imgs))
+# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
+title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
+
+print(title)
+print(article)
+print(imgs)
+print(type(imgs))
 #
 # download_and_process_images(imgs,"1")

-name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-print(name[0])
+#
+# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
+# print(name[0])
@@ -271,7 +271,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
         else:
             new_text = text.replace("```markdown", "").replace("```", "")
             content = new_text
-        image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+        # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+        # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+        from pathlib import Path
+
+        img_path = Path(img_path)
+        image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")

         # crop_and_replace_images(image_folder)

         create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
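Why the pathlib form is the safer replacement for the string concatenation: the / operator joins path components without hand-managed separators, and the rstrip(".") still trims trailing dots left over after removing ".txt". A small sketch with hypothetical paths:

from pathlib import Path

img_path = Path(r"F:\work\pictures")   # hypothetical base directory
txt_name = "my article..txt"
image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
print(image_folder)                    # F:\work\pictures\my article (on Windows)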
@@ -89,14 +89,21 @@ def extract_content_until_punctuation(text, punctuations=r'[,。!?;]'):



-# Read one column of an Excel sheet and return its contents as a list
+# Read the link column and the type column of an Excel sheet and return them as a list of tuples
 def read_excel(file_name):
     datas = pd.read_excel(file_name)
-    first_column_name = datas.columns[0]
-    first_colunm_data = datas[first_column_name].tolist()
-    print(first_colunm_data)
+    first_column_name = datas.columns[0]  # link column
+    type_column_name = '类型'  # type column

-    return first_colunm_data
+    links = datas[first_column_name].tolist()
+    # Read the type column if it exists; otherwise use the default type
+    types = datas[type_column_name].tolist() if type_column_name in datas.columns else ['默认'] * len(links)
+
+    # Combine the links and types into a list of tuples
+    result = list(zip(links, types))
+    print(result)
+
+    return result
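A hedged usage sketch of the new read_excel contract (the file name and column contents are hypothetical): the first column supplies the links, and a 类型 column is optional.

rows = read_excel("links.xlsx")
# e.g. [("https://mp.weixin.qq.com/s/...", "科技"),
#       ("https://www.163.com/dy/article/JV4K9D020553VRO2.html", "默认")]
for link, article_type in rows:
    print(link, article_type)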