feed.py

import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

import requests
import time
from bs4 import BeautifulSoup
import itertools
from urllib.parse import parse_qs

import settings
from feeds import hackernews, reddit, tildes, substack, lobsters
from feeds.manual import manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline
from scrapers.declutter import declutter, declutterlite, headless, simple
from utils import clean

INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
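
# Build one reader per configured Substack publication, category page and sitemap,
# keyed by the source name used in settings.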
substacks = {}
for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])

categories = {}
for key, value in settings.CATEGORY.items():
    categories[key] = Category(value)

sitemaps = {}
for key, value in settings.SITEMAP.items():
    sitemaps[key] = Sitemap(value)

def get_source_ref(link, urldata):
    # Map a submitted link to its feed source and story ref.
    # Returns (source, ref, urlref); for category/sitemap sources the urlref is
    # the original link, for the aggregator sources it is the ref itself.
    source = None
    ref = None
    if 'news.ycombinator.com' in urldata.hostname:
        source = 'hackernews'
        ref = parse_qs(urldata.query)['id'][0]
    elif 'tildes.net' in urldata.hostname and '~' in link:
        source = 'tildes'
        ref = urldata.path.split('/')[2]
    elif 'lobste.rs' in urldata.hostname and '/s/' in link:
        source = 'lobsters'
        ref = urldata.path.split('/')[2]
    elif 'reddit.com' in urldata.hostname and 'comments' in link:
        source = 'reddit'
        ref = urldata.path.split('/')[4]
    else:
        for key, sites in categories.items():
            if sites.is_match(urldata.hostname):
                ref = sites.get_id(link)
                return key, ref, link
        for key, sites in sitemaps.items():
            if sites.is_match(urldata.hostname):
                ref = sites.get_id(link)
                return key, ref, link

    return source, ref, ref
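
# Usage sketch (hypothetical values; assumes urllib.parse.urlparse is also imported):
#   urldata = urlparse('https://news.ycombinator.com/item?id=20802050')
#   get_source_ref('https://news.ycombinator.com/item?id=20802050', urldata)
#   # -> ('hackernews', '20802050', '20802050')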

def get_list():
    feeds = {}

    feeds['manual'] = [(x, 'manual', x) for x in manual.feed()]

    if settings.NUM_HACKERNEWS:
        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

    if settings.NUM_LOBSTERS:
        feeds['lobsters'] = [(x, 'lobsters', x) for x in lobsters.feed()[:settings.NUM_LOBSTERS]]

    if settings.NUM_REDDIT:
        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]

    if settings.NUM_TILDES:
        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]

    if settings.NUM_SUBSTACK:
        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
        count = settings.SUBSTACK[key]['count']
        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]

    for key, sites in categories.items():
        count = settings.CATEGORY[key].get('count') or 0
        excludes = settings.CATEGORY[key].get('excludes')
        tz = settings.CATEGORY[key].get('tz')
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
        count = settings.SITEMAP[key].get('count') or 0
        excludes = settings.SITEMAP[key].get('excludes')
        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    values = feeds.values()
    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
    feed = list(filter(None, feed))
    return feed
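
# get_list returns a flat list of (ref, source, urlref) tuples. The zip_longest/chain
# combination above interleaves the per-source lists so that no single feed dominates
# the front of the combined list; e.g. (made-up refs) lists [1, 2] and [3] interleave
# to [1, 3, 2] after the None fill values are filtered out.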

def get_article(url):
    scrapers = {
        'headless': headless,
        'simple': simple,
        'outline': outline,
        'declutter': declutter,
        'declutterlite': declutterlite,
    }
    available = settings.SCRAPERS or ['headless', 'simple']
    # always keep 'simple' available as a last-resort scraper
    if 'simple' not in available:
        available += ['simple']

    # try each configured scraper in order until one returns content
    for scraper in available:
        if scraper not in scrapers.keys():
            continue
        try:
            details = scrapers[scraper].get_details(url)
            if details and details.get('content'):
                return details, scrapers[scraper].name
        except KeyboardInterrupt:
            raise
        except:
            pass
    return None, None

def get_content_type(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            'X-Forwarded-For': '66.249.66.1',
        }
        return requests.get(url, headers=headers, timeout=5).headers['content-type']
    except:
        pass

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}
        return requests.get(url, headers=headers, timeout=10).headers['content-type']
    except:
        return ''
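
# A story dict passed to update_story needs at least 'source' and 'ref' (as produced
# by get_list / get_source_ref); the remaining fields ('title', 'text', 'excerpt',
# 'image', ...) are filled in place, and False is returned when the story is not
# ready yet or should be dropped.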
def update_story(story, is_manual=False, urlref=None):
    res = {}

    if story['source'] == 'hackernews':
        res = hackernews.story(story['ref'])
    elif story['source'] == 'lobsters':
        res = lobsters.story(story['ref'])
    elif story['source'] == 'reddit':
        res = reddit.story(story['ref'])
    elif story['source'] == 'tildes':
        res = tildes.story(story['ref'])
    elif story['source'] == 'substack':
        res = substack.top.story(story['ref'])
    elif story['source'] in categories.keys():
        res = categories[story['source']].story(story['ref'], urlref)
    elif story['source'] in sitemaps.keys():
        res = sitemaps[story['source']].story(story['ref'], urlref)
    elif story['source'] in substacks.keys():
        res = substacks[story['source']].story(story['ref'])
    elif story['source'] == 'manual':
        res = manual.story(story['ref'])

    if res:
        story.update(res) # join dicts
    else:
        logging.info('Story not ready yet')
        return False

    if story['date'] and not is_manual and story['date'] + settings.MAX_STORY_AGE < time.time():
        logging.info('Story too old, removing')
        return False

    has_url = story.get('url') or False
    has_text = story.get('text') or False
    text = story.get('text', '')
    more = ['more to come']
    needs_more = any([x in text.lower() for x in more])
    #is_simple = story.get('scraper', '') == 'simple'

    if has_url and (not has_text or needs_more):
        if not get_content_type(story['url']).startswith('text/'):
            logging.info('URL invalid file type / content type:')
            logging.info(story['url'])
            return False

        if any([domain in story['url'] for domain in INVALID_DOMAINS]):
            logging.info('URL invalid domain:')
            logging.info(story['url'])
            return False

        logging.info('Getting article ' + story['url'])
        details, scraper = get_article(story['url'])
        if not details: return False
        if not story['title']:
            story['title'] = clean(details.get('title', ''))
        story['scraper'] = scraper
        story['text'] = details.get('content', '')
        if not story['text']: return False
        story['last_update'] = time.time()
        story['excerpt'] = details.get('excerpt', '')
        story['scraper_link'] = details.get('scraper_link', '')
        meta = details.get('meta')
        if meta:
            og = meta.get('og')
            story['image'] = meta.get('image', '')
            if og:
                story['image'] = og.get('og:image', meta.get('image', ''))
            links = meta.get('links', [])
            if links:
                story['meta_links'] = links
                #manual.add_links(links)

    return True

if __name__ == '__main__':
    #test_news_cache = {}
    #nid = 'jean'
    #ref = 20802050
    #source = 'hackernews'
    #test_news_cache[nid] = dict(id=nid, ref=ref, source=source)
    #news_story = test_news_cache[nid]
    #update_story(news_story)

    #print(get_article('https://www.bloomberg.com/news/articles/2019-09-23/xi-s-communists-under-pressure-as-high-prices-hit-china-workers'))

    a = get_article('https://blog.joinmastodon.org/2019/10/mastodon-3.0/')
    print(a)

    print('done')