# 传入 url,指定 language 的效果比较好
from newspaper import Article

In [10]: article = Article(url, language='zh')
In [11]: article.download()
In [12]: article.parse()
In [21]: article.html
Out[21]: '<!DOCTYPE html><html><head> <meta charset="UTF-8"> ...'   # 输出过长,此处截断

In [23]: article.title
Out[23]: '大公司头条:拼多多年活跃买家接近阿里;三星发布新款移动处理器,抢占 5G 芯片份额'

In [24]: article.text
Out[24]: '我们每天为你摘取最重要的商业新闻 ...'   # 输出过长,此处截断

In [25]: article.publish_date   # 无输出(应为 None,即未提取到发布日期,待确认)

In [29]: article.authors
Out[29]: []
# 传入文本(HTML 字符串)
import requests
from newspaper import fulltext

>>> html = requests.get(...).text
>>> article = fulltext(html)
# 列表页提取,效果不太理想
import newspaper

In [33]: paper = newspaper.build(home_url, language='zh')
In [34]: list(paper.category_urls())
Out[34]: []

In [36]: paper.articles
Out[36]: []

In [37]: paper.article_urls()
Out[37]: []