
Extend manual submission to detect sitemap and category URLs.

Jason Schwarzenberger, 1 month ago
commit 3acc1b3a7e
5 changed files with 54 additions and 16 deletions
  1. apiserver/feed.py            +31 -0
  2. apiserver/feeds/category.py  +1 -0
  3. apiserver/feeds/sitemap.py   +1 -0
  4. apiserver/misc/news.py       +11 -0
  5. apiserver/server.py          +10 -16

+ 31 - 0
apiserver/feed.py

@@ -7,6 +7,7 @@ import requests
 import time
 from bs4 import BeautifulSoup
 import itertools
+from urllib.parse import parse_qs
 
 import settings
 from feeds import hackernews, reddit, tildes, substack, lobsters
@@ -17,6 +18,7 @@ from scrapers import outline
 from scrapers.declutter import declutter, declutterlite, headless, simple
 from utils import clean
 
+
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com', 'sec.gov']
 
 substacks = {}
@@ -29,6 +31,35 @@ sitemaps = {}
 for key, value in settings.SITEMAP.items():
     sitemaps[key] = Sitemap(value)
 
+
+def get_source_ref(link, urldata):
+    source = None
+    ref = None
+    if 'news.ycombinator.com' in urldata.hostname:
+        source = 'hackernews'
+        ref = parse_qs(urldata.query)['id'][0]
+    elif 'tildes.net' in urldata.hostname and '~' in link:
+        source = 'tildes'
+        ref = urldata.path.split('/')[2]
+    elif 'lobste.rs' in urldata.hostname and '/s/' in link:
+        source = 'lobsters'
+        ref = urldata.path.split('/')[2]
+    elif 'reddit.com' in urldata.hostname and 'comments' in link:
+        source = 'reddit'
+        ref = urldata.path.split('/')[4]
+    else:
+        for key, sites in categories.items():
+            if sites.is_match(urldata.hostname):
+                ref = sites.get_id(link)
+                return key, ref, link
+        for key, sites in sitemaps.items():
+            if sites.is_match(urldata.hostname):
+                ref = sites.get_id(link)
+                return key, ref, link
+
+    return source, ref, ref
+
+
 def get_list():
     feeds = {}
 

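The new `get_source_ref` helper centralises the source detection that previously lived inline in `server.py`. A minimal usage sketch, assuming the module-level `categories` and `sitemaps` dicts are populated as above (the Hacker News item id is illustrative):

    from urllib.parse import urlparse

    # Hypothetical submission: an item page on a known aggregator.
    link = 'https://news.ycombinator.com/item?id=25062152'
    source, ref, urlref = get_source_ref(link, urlparse(link))
    # -> ('hackernews', '25062152', '25062152')

    # A link matching a configured category or sitemap feed instead
    # returns the feed's key and the original link as urlref.
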
+ 1 - 0
apiserver/feeds/category.py

@@ -35,6 +35,7 @@ def _get_category(category_url, excludes=None):
 
 class Category(Base):
     def __init__(self, config):
+        super().__init__(config)
         self.config = config
         self.category_url = config.get('url')
         self.tz = config.get('tz')

+ 1 - 0
apiserver/feeds/sitemap.py

@@ -59,6 +59,7 @@ def _get_sitemap(feed_url, excludes=None):
 
 class Sitemap(Base):
     def __init__(self, config):
+        super().__init__(config)
         self.config = config
         self.sitemap_url = config.get('url')
         self.tz = config.get('tz')

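Both `Category.__init__` and `Sitemap.__init__` now chain to the base class before setting their own attributes. The point of the change is that the new `Base.is_match` (next file) reads `self.url`, which is presumably initialised in `Base.__init__`; a minimal sketch of the assumed pattern:

    class Base:
        def __init__(self, config):
            # assumed: Base stores the configured url(s) that is_match() compares against
            self.url = config.get('url')

    class Sitemap(Base):
        def __init__(self, config):
            super().__init__(config)  # populate self.url before is_match() is called
            self.sitemap_url = config.get('url')
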
+ 11 - 0
apiserver/misc/news.py

@@ -8,6 +8,7 @@ import requests
 from bs4 import BeautifulSoup
 from scrapers.declutter import headless
 import extruct
+from urllib.parse import urlparse
 
 import settings
 from utils import clean
@@ -44,6 +45,16 @@ class Base:
             return link
         return patterns[0]
 
+    def is_match(self, hostname):
+        primary = []
+        if isinstance(self.url, str):
+            primary = [self.url]
+        elif isinstance(self.url, list):
+            primary = self.url
+
+        primary = [urlparse(url).hostname for url in primary]
+        return hostname in primary
+
     def feed(self, excludes=None):
         return []
 

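`Base.is_match` accepts either a single URL string or a list of URLs from the feed config and compares hostnames exactly. The matching logic in isolation (hostnames are illustrative):

    from urllib.parse import urlparse

    # A feed matches a submitted link whenever the link's hostname equals
    # the hostname of one of the feed's configured URLs.
    configured = ['https://example.com/sitemap.xml']        # illustrative config
    hostnames = [urlparse(u).hostname for u in configured]  # ['example.com']
    assert 'example.com' in hostnames
    assert 'news.example.com' not in hostnames              # subdomains do not match
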
+ 10 - 16
apiserver/server.py

@@ -64,25 +64,17 @@ def submit():
     try:
         url = request.form['url']
         nid = new_id()
-
         parse = urlparse(url)
-        if 'news.ycombinator.com' in parse.hostname:
-            source = 'hackernews'
-            ref = parse_qs(parse.query)['id'][0]
-        elif 'tildes.net' in parse.hostname and '~' in url:
-            source = 'tildes'
-            ref = parse.path.split('/')[2]
-        elif 'lobste.rs' in parse.hostname and '/s/' in url:
-            source = 'lobsters'
-            ref = parse.path.split('/')[2]
-        elif 'reddit.com' in parse.hostname and 'comments' in url:
-            source = 'reddit'
-            ref = parse.path.split('/')[4]
-        elif settings.HOSTNAME in parse.hostname:
+
+        if settings.HOSTNAME in parse.hostname:
             raise Exception('Invalid URL')
-        else:
+
+        source, ref, urlref = feed.get_source_ref(url, parse)
+
+        if not source or not ref:
             source = 'manual'
             ref = url
+            urlref = url
 
         existing = database.get_story_by_ref(ref)
         if existing:
@@ -93,8 +85,10 @@ def submit():
             return {'nid': existing.sid}
         else:
             story = dict(id=nid, ref=ref, source=source)
-            valid = feed.update_story(story, is_manual=True)
+            valid = feed.update_story(story, is_manual=True, urlref=urlref)
             if valid:
+                if source != 'manual':
+                    database.put_ref(ref, nid, source, urlref)
                 database.put_story(story)
                 search.put_story(story)
                 return {'nid': nid}
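
With the detection moved into `feed.get_source_ref`, the submit handler keeps only the fallback locally. A condensed sketch of the resulting flow (the URL is illustrative):

    from urllib.parse import urlparse

    url = 'https://example.com/some-article'  # not an aggregator or configured feed
    parse = urlparse(url)

    source, ref, urlref = feed.get_source_ref(url, parse)
    if not source or not ref:
        # unclassified links fall through to a manual story keyed by the URL itself
        source, ref, urlref = 'manual', url, url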