headless.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. import logging
  2. logging.basicConfig(
  3. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
  4. level=logging.DEBUG)
  5. import requests
  6. from settings import READER_PORT
  7. READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
  8. READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
  9. TIMEOUT = 60
  10. def get_html(url):
  11. logging.info(f"Headless Browser Scraper: {url}")
  12. details = get_details(url)
  13. if not details:
  14. return ''
  15. return details['content']
  16. def get_details(url):
  17. try:
  18. r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
  19. if r.status_code != 200:
  20. raise Exception('Bad response code ' + str(r.status_code))
  21. return r.json()
  22. except KeyboardInterrupt:
  23. raise
  24. except BaseException as e:
  25. logging.error('Problem scraping article: {}'.format(str(e)))
  26. return None
  27. def get_comments(url):
  28. try:
  29. r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
  30. if r.status_code != 200:
  31. raise Exception('Bad response code ' + str(r.status_code))
  32. return r.json()
  33. except KeyboardInterrupt:
  34. raise
  35. except BaseException as e:
  36. logging.error('Problem getting comments for article: {}'.format(str(e)))
  37. return None