Browse Source

Add Stuff, RNZ, and non-loopy NZ Herald support.

Jason Schwarzenberger 11 months ago
parent
commit
ec5dc1708b
4 changed files with 80 additions and 32 deletions
  1. 1 1
      index.html
  2. 34 27
      utils/bypass.js
  3. 4 4
      utils/dom-node.js
  4. 41 0
      utils/sites.js

+ 1 - 1
index.html

@@ -48,7 +48,7 @@
 	<h1>NZH Outline</h1>
 	<h2>Remove the clutter</h2>
 	<form action="/" method="POST" accept-charset="UTF-8">
-		<input name="url" pattern="https:\/\/(www\.)?nzherald\.co\.nz/.*" placeholder="Enter article link" />
+		<input name="url" placeholder="Enter article link" />
 		<button value="true" name="redirect" type="submit">Outline</button>
 	</form>
 	<p>

+ 34 - 27
utils/bypass.js

@@ -1,21 +1,8 @@
 const { JSDOM } = require('jsdom');
 
-const { parse } = require('./ld');
 const telegraph = require('./telegraph');
 const domToNode = require('./dom-node');
-
-const findArticleSchema = document => {
-  return Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
-    .map(e => parse(e.textContent))
-    .find(l => {
-      if (!l) {
-        return false;
-      }
-      const isSchema = l['@context'] === 'http://schema.org';
-      const isArticle = l['@type'] === 'Article';
-      return isSchema && isArticle;
-    });
-};
+const sites = require('./sites');
 
 const cleanHtmlText = text => {
   const s = new JSDOM('').window.document.createElement('span');
@@ -23,10 +10,28 @@ const cleanHtmlText = text => {
   return s.textContent;
 };
 
-const buildContent = (document, url) => {
-  const node = document.querySelector('.full-content,.premium-content').cloneNode({ deep: true });
-  const bad = node.querySelectorAll('meta,script,style,.related-header,.related-articles-container,.ad-container');
-  [].forEach.call(bad, b => b.remove());
+const getText = (document, selector) => {
+  const node = selector.split(',').reduce((p, s) => p || document.querySelector(s), null);
+  return node ? node.textContent.trim() : '';
+};
+
+const getPublisherName = (document, site) => {
+  const publisher = getText(document, site.selectors.publisher) || site.publisher;
+  if (publisher !== site.publisher) {
+    return `${publisher} via ${site.publisher}`;
+  }
+  return publisher;
+};
+
+const getPremiumTag = (document, site) => {
+  return site.selectors.premium && !!document.querySelector(site.selectors.premium) ? site.premium : '';
+};
+
+const buildContent = (document, site, url) => {
+  const node = document.querySelector(site.selectors.content).cloneNode({ deep: true });
+  Array.from(node.querySelectorAll(site.selectors.bad))
+    .concat(Array.from(node.querySelectorAll('meta,script,style')))
+    .forEach(b => b.remove());
 
   const source = document.createElement('p');
   source.innerHTML = `<a href="${url}"">${url}</a>.`;
@@ -39,20 +44,22 @@ module.exports = async url => {
   const DOM = await JSDOM.fromURL(url);
   const document = DOM.window.document;
 
-  const ld = findArticleSchema(document);
-
-  const content = buildContent(document, url);
-  const title = cleanHtmlText(ld.headline);
-  let premium = '';
-  if (ld.hasPart && ld.hasPart.isAccessibleForFree) {
-    premium = ' Premium';
+  const site = sites.find(s => s.host.test(url));
+  if (!site) {
+    throw new Error('unknown website');
   }
-  const authorName = cleanHtmlText(`${ld.author.name} &bull; ${ld.publisher.name}${premium}`);
+
+  const content = buildContent(document, site, url);
+  const title = getText(document, site.selectors.title);
+  const author = getText(document, site.selectors.authorName);
+  const publisher = getPublisherName(document, site);
+  const premium = getPremiumTag(document, site) || '';
+  const authorName = cleanHtmlText([author, publisher + premium].filter(s => !!s.trim()).join(' &bull; '));
 
   const account = await telegraph.createAccount({
     author_name: authorName,
     author_url: url,
-    short_name: ld.author.name
+    short_name: author || site.publisher
   });
   const page = await telegraph.createPage(title, content, account);
 

+ 4 - 4
utils/dom-node.js

@@ -9,10 +9,10 @@ module.exports = function domToNode(domNode) {
   nodeElement.tag = domNode.tagName.toLowerCase();
   for (var i = 0; i < domNode.attributes.length; i++) {
     var attr = domNode.attributes[i];
-    if (attr.name == 'href' || attr.name == 'src') {
-      if (!nodeElement.attrs) {
-        nodeElement.attrs = {};
-      }
+    if (!nodeElement.attrs) {
+      nodeElement.attrs = {};
+    }
+    if ('href src data-srcset alt srcset'.includes(attr.name)) {
       nodeElement.attrs[attr.name] = attr.value;
     }
   }

+ 41 - 0
utils/sites.js

@@ -0,0 +1,41 @@
+module.exports = [
+  {
+    host: /(www\.)?nzherald\.co\.nz/,
+    publisher: 'NZ Herald',
+    premium: ' Premium',
+    selectors: {
+      authorName: '.author-title,.author span',
+      content: '.full-content,.premium-content',
+      title: 'h1',
+      publisher: '.syndicator-name',
+      premium: '.premium-content',
+      bad: '.related-header,.related-articles-container,.ad-container'
+    }
+  },
+  {
+    host: /(www\.)?stuff\.co\.nz/,
+    publisher: 'Stuff',
+    premium: '',
+    selectors: {
+      authorName: '.sics-component__byline__author',
+      content: '.sics-component__story',
+      title: 'h1',
+      publisher: '.sics-component__story__source',
+      premium: '',
+      bad: '.sics-component__story__source,.sics-component__sharebar'
+    }
+  },
+  {
+    host: /(www\.)?r(adio)?nz\.co\.nz/,
+    publisher: 'Radio NZ',
+    premium: '',
+    selectors: {
+      authorName: '.author-name',
+      content: '.article__body',
+      title: 'h1',
+      publisher: '.prog-name',
+      premium: '',
+      bad: ''
+    }
+  }
+];