Browse Source

close puppeteer properly, add readability.js for unknown sites.

Jason Schwarzenberger 11 months ago
parent
commit
1f4fb2216c
8 changed files with 118 additions and 30 deletions
  1. 3 0
      README.md
  2. 2 0
      index.html
  3. 1 1
      index.js
  4. 4 0
      package-lock.json
  5. 2 1
      package.json
  6. 4 1
      utils/bypass.js
  7. 77 27
      utils/declutter.js
  8. 25 0
      utils/sites.js

+ 3 - 0
README.md

@@ -10,10 +10,13 @@ https://declutter.1j.nz
 - Newsroom
 - Noted
 - NZ Herald
+- Otago Daily Times
 - Radio NZ
 - Stuff
 - The Spinoff
 
+Other sites: a fallback is provided using [Readability.js](https://github.com/mozilla/readability).
+
 ## License
 
 MIT License

+ 2 - 0
index.html

@@ -46,6 +46,7 @@
 			font-size: 1.25rem;
 			line-height: 1.5;
 			border: none;
+			border-radius: 0;
 			background: #fff;
 			vertical-align: middle;
 		}
@@ -63,6 +64,7 @@
 			line-height: 1.5;
 			border: none;
 			border-left: solid 1px #aaa;
+			border-radius: 0;
 			background: #f1f1f1;
 			vertical-align: middle;
 		}

+ 1 - 1
index.js

@@ -29,7 +29,7 @@ const declutterRequest = async (res, url, redirect) => {
       case 'Unsupported website':
         return res.sendStatus(400);
     }
-    console.error(e.message);
+    console.error(e);
     return res.sendStatus(500);
   }
 };

+ 4 - 0
package-lock.json

@@ -936,6 +936,10 @@
         "unpipe": "1.0.0"
       }
     },
+    "readability": {
+      "version": "git+https://github.com/mozilla/readability.git#2982216913af2c66b0690e88606b03116553ad92",
+      "from": "git+https://github.com/mozilla/readability.git"
+    },
     "readable-stream": {
       "version": "2.3.6",
       "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",

+ 2 - 1
package.json

@@ -13,6 +13,7 @@
     "express": "^4.17.1",
     "jsdom": "^15.1.1",
     "node-fetch": "^2.6.0",
-    "puppeteer": "^1.19.0"
+    "puppeteer": "^1.19.0",
+    "readability": "git+https://github.com/mozilla/readability.git"
   }
 }

+ 4 - 1
utils/bypass.js

@@ -1,4 +1,7 @@
-module.exports = async tab => {
+module.exports = async (tab, url) => {
+  if (!/^https?:\/\/(www.)?nzherald\.co\.nz/.test(url)) {
+    return;
+  }
   await tab.evaluate(() => {
     (function() {
       const loopParse = (json, iterations) => {

+ 77 - 27
utils/declutter.js

@@ -1,6 +1,8 @@
 const { JSDOM } = require('jsdom');
 const puppeteer = require('puppeteer');
+const Readability = require('readability');
 
+const domToNode = require('./dom-node');
 const telegraph = require('./telegraph');
 const sites = require('./sites');
 const bypass = require('./bypass');
@@ -57,21 +59,6 @@ const buildContent = async (tab, site, url) => {
       source.innerHTML = `<a href="${url}"">${url}</a>.`;
       node.prepend(source);
 
-      const host = url
-        .split('/')
-        .slice(0, 3)
-        .join('/');
-      Array.from(node.querySelectorAll('[src^="/"]'))
-        .filter(e => /^\/[^\/]/.test(e.attributes.src.value))
-        .forEach(e => {
-          e.attributes.src.value = `${host}${e.attributes.src.value}`;
-        });
-      Array.from(node.querySelectorAll('[href^="/"]'))
-        .filter(e => /^\/[^\/]/.test(e.attributes.href.value))
-        .forEach(e => {
-          e.attributes.href.value = `${host}${e.attributes.href.value}`;
-        });
-
       function domToNode(domNode) {
         if (domNode.nodeType == domNode.TEXT_NODE) {
           return domNode.data;
@@ -106,32 +93,95 @@ const buildContent = async (tab, site, url) => {
   );
 };
 
+/**
+ * Rewrites root-relative `src` and `href` attribute values in the page so
+ * they are prefixed with the scheme and host taken from `url`.
+ *
+ * Values that start with `//` are left alone, as are elements whose matching
+ * attribute is missing.
+ *
+ * @param {object} tab - object exposing `evaluate(fn, arg)`; the rewrite runs through it.
+ * @param {string} url - address whose first three `/`-separated parts form the prefix.
+ * @returns {Promise<*>} whatever `tab.evaluate` resolves to.
+ */
+const fixRelativeLinks = async (tab, url) => {
+  // Everything the rewrite needs lives inside this function: it is handed to
+  // `tab.evaluate` and must not lean on names from the surrounding module.
+  const rewriteInPage = pageUrl => {
+    const leadingParts = pageUrl.split('/').slice(0, 3);
+    const prefix = leadingParts.join('/');
+    const rootRelative = /^\/[^\/]/;
+    const passes = [['src', '[src^="/"]'], ['href', '[href^="/"]']];
+
+    for (const [attrName, selector] of passes) {
+      for (const element of Array.from(document.querySelectorAll(selector))) {
+        const attr = element.attributes[attrName];
+        if (!attr || !rootRelative.test(attr.value)) {
+          continue;
+        }
+        attr.value = `${prefix}${attr.value}`;
+      }
+    }
+  };
+
+  const outcome = await tab.evaluate(rewriteInPage, url);
+  return outcome;
+};
+
+/**
+ * Fallback extractor for pages that have no entry in `sites`: runs
+ * Readability over the tab's current HTML and converts the extracted article
+ * with `domToNode`, prepending a paragraph that links back to `url`.
+ *
+ * @param {object} tab - puppeteer page; its `content()` supplies the HTML to parse.
+ * @param {string} url - article address, used as the JSDOM base URL and for the source link.
+ * @returns {Promise<Array>} children of the converted article node, with
+ *   whitespace-only text entries removed.
+ * @throws {Error} 'Unsupported website' when Readability extracts nothing.
+ */
+const buildReadableContent = async (tab, url) => {
+  const document = new JSDOM(await tab.content(), { url }).window.document;
+  const article = new Readability(document).parse();
+  // parse() can come back empty; fail with the message index.js maps to a
+  // 400 response instead of crashing on `.content` of null.
+  if (!article || !article.content) {
+    throw new Error('Unsupported website');
+  }
+
+  const dom = new JSDOM(`<html><body>${article.content}</body></html>`);
+  const div = dom.window.document.querySelector('div');
+
+  // Build the source link from DOM nodes so the URL is escaped as text and
+  // attribute data rather than being parsed as markup.
+  const link = dom.window.document.createElement('a');
+  link.setAttribute('href', url);
+  link.textContent = url;
+  const source = dom.window.document.createElement('p');
+  source.append(link, '.');
+  div.prepend(source);
+
+  // Declared locally: the undeclared assignment used before leaked a global
+  // that concurrent requests would share.
+  const content = domToNode(div).children.filter(m => !m.trim || m.trim().length > 0);
+  return content;
+};
+
 module.exports = async url => {
   const site = sites.find(s => s.host.test(url));
-  if (!site) {
-    throw new Error('Unsupported website');
-  }
 
   const browser = await puppeteer.launch();
   const tab = await browser.newPage();
   await tab.setUserAgent('Googlebot/2.1 (+http://www.google.com/bot.html)');
   await tab.setViewport({ width: 2000, height: 10000 });
-  await tab.goto(url, { timeout: site.timeout, waitUntil: site.waitUntil });
-  if (site.publisher === 'NZ Herald') {
-    await bypass(tab);
+  await tab.goto(url, {
+    timeout: site ? site.timeout : 30000,
+    waitUntil: site ? site.waitUntil : 'domcontentloaded'
+  });
+  await fixRelativeLinks(tab, url);
+  await bypass(tab, url);
+
+  let premium = '';
+  let content = '';
+  let { title, author, publisher } = await tab.evaluate(url => {
+    const titleSelector = [
+      'meta[property="og:title"]',
+      'meta[property="twitter:title"]',
+      'meta[property="title"]'
+    ].join(',');
+    const $title = document.querySelector(titleSelector);
+    const $publisher = document.querySelector('meta[property="og:site_name"]');
+    return {
+      title: $title && $title.content ? $title.content : '',
+      author: $publisher && $publisher.content ? $publisher.content : '',
+      publisher: new URL(url).host
+    };
+  }, url);
+
+  if (site) {
+    const meta = await Promise.all([
+      buildContent(tab, site, url),
+      getText(tab, site.selectors.title),
+      getAuthorName(tab, site.selectors.authorName),
+      getPublisherName(tab, site),
+      getPremiumTag(tab, site)
+    ]);
+    content = meta[0];
+    title = meta[1];
+    author = meta[2];
+    publisher = meta[3];
+    premium = meta[4] || '';
+  } else {
+    content = await buildReadableContent(tab, url);
   }
 
-  const content = await buildContent(tab, site, url);
-  const title = await getText(tab, site.selectors.title);
-  const author = await getAuthorName(tab, site.selectors.authorName);
-  const publisher = await getPublisherName(tab, site);
-  const premium = (await getPremiumTag(tab, site)) || '';
   const authorName = cleanHtmlText([author, publisher + premium].filter(s => !!s.trim()).join(' &bull; '));
 
+  await tab.close();
+  await browser.close();
+
   const account = await telegraph.createAccount({
     author_name: authorName,
     author_url: url,
-    short_name: author || site.publisher
+    short_name: (author || publisher || authorName).substring(0, 31)
   });
   const page = await telegraph.createPage(title, content, account);
 

+ 25 - 0
utils/sites.js

@@ -109,5 +109,30 @@ module.exports = [
     },
     timeout: 30000,
     waitUntil: 'domcontentloaded'
+  },
+  {
+    host: /(www\.)?odt\.co\.nz/,
+    publisher: 'Otago Daily Times',
+    premium: '',
+    selectors: {
+      authorName: '.byline a,.byline',
+      content: 'article.node-story ',
+      title: 'h1.page-header',
+      publisher: '',
+      premium: '',
+      bad: [
+        '.pane-odt-promotion-promoted-items',
+        '.breadcrumb-wrapper',
+        '.share-icons-header',
+        '.sharethis-buttons',
+        '.byline,header',
+        'footer',
+        '#related-stories',
+        '.comment-count-footer',
+        '.comment-wrapper'
+      ].join(',')
+    },
+    timeout: 30000,
+    waitUntil: 'domcontentloaded'
   }
 ];