from bs4 import BeautifulSoup
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests


def extract_images_from_html(html_content):
    """Collect all image URLs starting with https://p3-sign.toutiaoimg.com/tos-cn-i."""
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img')
    img_urls = []
    for img in img_tags:
        # Lazy-loaded images often keep the real URL in data-src instead of src.
        for attr in ['src', 'data-src']:
            url = img.get(attr)
            if url and url.startswith("https://p3-sign.toutiaoimg.com/tos-cn-i"):
                img_urls.append(url)
    # Deduplicate while preserving document order.
    img_urls = list(dict.fromkeys(img_urls))
    # Return a JSON-style dict.
    return {"image": img_urls}


# ============================================================
def get_webpage_source(url):
    """
    Fetch a page's HTML source with requests (suitable for static pages).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        # Add a random delay to mimic human browsing.
        time.sleep(random.uniform(1, 3))
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        # Check the response status.
        if response.status_code == 200:
            return response.text
        else:
            print(f"Request failed, status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error while fetching page source: {e}")
        return None
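
# A minimal sanity check for extract_images_from_html, not part of the original
# script. The markup and URLs below are made up for illustration: only the two
# p3-sign.toutiaoimg.com links should survive the prefix filter, and the
# duplicate should be removed by the deduplication step.
def _demo_extract_images():
    sample_html = """
    <article>
      <img src="https://p3-sign.toutiaoimg.com/tos-cn-i/aaa111.jpg">
      <img data-src="https://p3-sign.toutiaoimg.com/tos-cn-i/bbb222.jpg">
      <img src="https://p3-sign.toutiaoimg.com/tos-cn-i/aaa111.jpg">
      <img src="https://example.com/ignored.png">
    </article>
    """
    print(extract_images_from_html(sample_html))
    # Expected: {'image': ['https://p3-sign.toutiaoimg.com/tos-cn-i/aaa111.jpg',
    #                      'https://p3-sign.toutiaoimg.com/tos-cn-i/bbb222.jpg']}
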
# ===================== Content extraction ==================================
def toutiao_extract_content(url):
    """
    Extract a Toutiao article page with BeautifulSoup.
    """
    html_content = get_webpage_source_selenium(url)

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title and article body.
    title_selector = '#root > div.article-detail-container > div.main > div.show-monitor > div > div > div > div > h1'
    article_selector = '#root > div.article-detail-container > div.main > div.show-monitor > div > div > div > div > div > article'

    title_element = soup.select_one(title_selector)
    article_element = soup.select_one(article_selector)

    title_text = title_element.get_text().strip() if title_element else ""
    article_text = article_element.get_text().strip() if article_element else ""

    # Extract image URLs via the whole-page scan.
    img_urls = extract_images_from_html(html_content)['image']

    return title_text, article_text, img_urls


def wechat_extract_content(url):
    """
    Extract a WeChat Official Account article page with BeautifulSoup.
    """
    html_content = get_webpage_source_selenium(url)

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # WeChat pages expose stable element IDs for the title and body.
    title_element = soup.select_one('#activity-name')
    article_element = soup.select_one('#js_content')

    title_text = title_element.get_text().strip() if title_element else ""
    article_text = article_element.get_text().strip() if article_element else ""

    # Collect image URLs from the article body (keep only https://mmbiz.qpic.cn links).
    img_elements = article_element.select('img') if article_element else []
    img_urls = []
    for img in img_elements:
        src = img.get('src') or img.get('data-src')
        if src and src.startswith('https://mmbiz.qpic.cn'):
            img_urls.append(src)

    return title_text, article_text, img_urls


def wangyi_extract_content(url):
    """
    Extract a NetEase (163.com) article page with BeautifulSoup.
    """
    html_content = get_webpage_source_selenium(url)

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title and article body.
    title_selector = '#contain > div.post_main > h1'
    article_selector = '#content > div.post_body'

    title_element = soup.select_one(title_selector)
    article_element = soup.select_one(article_selector)

    title_text = title_element.get_text().strip() if title_element else ""
    article_text = article_element.get_text().strip() if article_element else ""

    # Collect image URLs from within the article body.
    img_elements = article_element.select('img') if article_element else []
    img_urls = [img.get('src') for img in img_elements if img.get('src')]

    return title_text, article_text, img_urls


def souhu_extract_content(url):
    """
    Extract a Sohu article page with BeautifulSoup.
    """
    html_content = get_webpage_source_selenium(url)

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the title and article body.
    title_selector = '#article-container > div.left.main > div:nth-child(1) > div > div.text-title > h1'
    article_selector = '#mp-editor'

    title_element = soup.select_one(title_selector)
    article_element = soup.select_one(article_selector)

    title_text = title_element.get_text().strip() if title_element else ""
    article_text = article_element.get_text().strip() if article_element else ""

    # Collect image URLs from within the article body.
    img_elements = article_element.select('img') if article_element else []
    img_urls = [img.get('src') for img in img_elements if img.get('src')]

    return title_text, article_text, img_urls
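
# The four extractors above share one shape: fetch the rendered HTML, try a
# site-specific CSS selector for the title and the article body, then pull
# <img> URLs out of the body. Below is a minimal sketch of that shared
# pattern; extract_with_selectors is a hypothetical helper, not part of the
# original script, and it assumes the caller supplies working selectors.
def extract_with_selectors(html_content, title_selector, article_selector,
                           url_prefix=""):
    soup = BeautifulSoup(html_content, 'html.parser')
    title_element = soup.select_one(title_selector)
    article_element = soup.select_one(article_selector)
    title_text = title_element.get_text().strip() if title_element else ""
    article_text = article_element.get_text().strip() if article_element else ""
    img_urls = []
    if article_element:
        for img in article_element.select('img'):
            src = img.get('src') or img.get('data-src')
            # Optionally keep only URLs from the expected image CDN;
            # the default empty prefix matches every URL.
            if src and src.startswith(url_prefix):
                img_urls.append(src)
    return title_text, article_text, list(dict.fromkeys(img_urls))
# For example, wechat_extract_content is roughly:
# extract_with_selectors(html, '#activity-name', '#js_content', 'https://mmbiz.qpic.cn')
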
def toutiao_w_extract_content(url):
    """
    Optimized Toutiao extractor: falls back through several selectors and
    collects image links from the article body only.
    """
    html_content = get_webpage_source_selenium(url)
    if not html_content:
        print("Failed to fetch HTML content")
        return "", "", []

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Candidate title selectors, tried in priority order.
    title_selectors = [
        '#root > div.article-detail-container > div.main > div.show-monitor > div > div > div > div > h1',
        'h1.article-title',
        'h1[data-testid="headline"]',
        '.article-title h1',
        '.article-header h1',
        'article h1',
        'h1'
    ]
    title_text = ""
    for selector in title_selectors:
        title_element = soup.select_one(selector)
        if title_element:
            title_text = title_element.get_text().strip()
            break

    # Candidate article-body selectors, tried in priority order.
    article_selectors = [
        '#root > div.article-detail-container > div.main > div.show-monitor > div > div > div > div > div > article',
        'article',
        '.article-content',
        '.content',
        '#js_content',
        '.post_body',
        '[data-testid="article-content"]'
    ]
    article_text = ""
    article_element = None
    for selector in article_selectors:
        article_element = soup.select_one(selector)
        if article_element:
            article_text = article_element.get_text().strip()
            break

    # Collect images from the article body only.
    img_urls = []
    if article_element:
        img_elements = article_element.find_all('img')
        for img in img_elements:
            # Try the common lazy-loading attributes as well as src.
            # (The loop variable is url_value, not url, to avoid shadowing
            # the function parameter.)
            for attr in ['src', 'data-src', 'data-original', 'data-lazy-src']:
                url_value = img.get(attr)
                if url_value:
                    # Normalize protocol-relative and site-relative paths.
                    if url_value.startswith('//'):
                        url_value = 'https:' + url_value
                    elif url_value.startswith('/'):
                        url_value = 'https://www.toutiao.com' + url_value
                    # Keep only Toutiao-hosted images.
                    if any(domain in url_value for domain in ['toutiaoimg.com', 'p3-sign.toutiaoimg.com', 'byteimg.com']):
                        img_urls.append(url_value)
                    break  # Stop at the first attribute that yields a URL.

    # Fall back to the whole-page scan if the body yielded no images.
    if not img_urls:
        extracted_imgs = extract_images_from_html(html_content)
        if extracted_imgs and 'image' in extracted_imgs:
            img_urls = extracted_imgs['image']

    # Deduplicate while preserving document order.
    img_urls = list(dict.fromkeys(img_urls))

    return title_text, article_text, img_urls


def get_webpage_source_selenium(url):
    """
    Enhanced Selenium fetcher, tuned for Toutiao's dynamically loaded pages.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    # Skip image downloads to speed up page loads; the URLs in the HTML are
    # all we need. (The original '--disable-images' is not a standard Chromium
    # switch; this Blink setting is the usual way to turn images off.)
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
    # Note: JavaScript must stay enabled here; these pages render their
    # content with JS, so the original '--disable-javascript' flag is dropped.
    chrome_options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # Give the page time to load.
        time.sleep(5)

        # Wait for the key elements to appear.
        wait = WebDriverWait(driver, 15)
        try:
            # Wait for the article title...
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
            # ...and then for the article body.
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "article")))
        except TimeoutException:
            print("Timed out waiting for key elements; using the current page content")

        # Scroll through the page to trigger lazy loading, then return to the top.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)

        page_source = driver.page_source

        # # Save the source for debugging:
        # with open("toutiao_source_enhanced.html", "w", encoding="utf-8") as f:
        #     f.write(page_source)

        return page_source
    except Exception as e:
        print(f"Error while fetching page source with enhanced Selenium: {e}")
        return None
    finally:
        # Always close the browser.
        driver.quit()
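
# The extractors only return image URLs; persisting them is left to the
# caller. Below is a minimal download sketch, not part of the original script:
# download_images is a hypothetical helper, the Referer header is an
# assumption (some image CDNs reject hotlinked requests without one), and the
# file extension is guessed naively from the URL path.
def download_images(img_urls, out_dir="images", referer=None):
    import os
    os.makedirs(out_dir, exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0'}
    if referer:
        headers['Referer'] = referer
    saved = []
    for i, img_url in enumerate(img_urls):
        try:
            resp = requests.get(img_url, headers=headers, timeout=10)
            resp.raise_for_status()
            # Guess an extension from the URL path; default to .jpg.
            ext = os.path.splitext(img_url.split('?')[0])[1] or '.jpg'
            path = os.path.join(out_dir, f"img_{i:03d}{ext}")
            with open(path, 'wb') as f:
                f.write(resp.content)
            saved.append(path)
            # Be polite between downloads.
            time.sleep(random.uniform(0.5, 1.5))
        except Exception as e:
            print(f"Failed to download {img_url}: {e}")
    return saved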
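
# Example entry point showing how the pieces fit together. The URL below is a
# placeholder, not a real article; swap in toutiao_extract_content,
# wechat_extract_content, wangyi_extract_content, or souhu_extract_content to
# match the site being scraped.
if __name__ == "__main__":
    test_url = "https://www.toutiao.com/article/0000000000000000000/"  # placeholder URL
    title, article, images = toutiao_w_extract_content(test_url)
    print(f"Title: {title}")
    print(f"Article length: {len(article)} characters")
    print(f"Found {len(images)} image URLs")
    for u in images:
        print(u)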