30 lines
1.0 KiB
Python
30 lines
1.0 KiB
Python
"""
|
|
获取页面内容
|
|
"""
|
|
|
|
def wechat_extract_content(page):
    """Extract the title and body text of a WeChat page.

    Args:
        page: Object exposing ``text_content(selector)``. Presumably a
            Playwright ``Page`` -- confirm against the caller.

    Returns:
        A ``(title_text, article_text)`` tuple holding whatever
        ``page.text_content`` returns for the ``#activity-name`` and
        ``#js_content`` selectors, in that order.
    """
    title_text = page.text_content('#activity-name')
    article_text = page.text_content('#js_content')
    return title_text, article_text
|
|
|
|
|
|
def toutiao_extract_content(page):
    """Extract page content from a Toutiao article page.

    Args:
        page: Object exposing ``text_content(selector)``. Presumably a
            Playwright ``Page`` -- confirm against the caller.

    Returns:
        A ``(title_text, article_text)`` tuple. Both elements are the
        result of ``page.text_content`` on the same ``article`` selector.
    """
    # Single definition of the selector; the pieces concatenate to the
    # exact CSS path of the article element.
    article_selector = (
        '#root > div.wtt-detail-container > div.main > div:nth-child(1)'
        ' > div > div > div > div > article'
    )

    # NOTE(review): the title is read with the same selector as the body,
    # so title_text holds the whole article text. Confirm whether a
    # dedicated title selector was intended. Two separate calls are kept
    # so the sequence of page queries is unchanged.
    title_text = page.text_content(article_selector)
    article_text = page.text_content(article_selector)
    return title_text, article_text
|
|
|
|
|