Browse Source

Add Chrome extension support; known sites can now set their own user agent (UA).

Jason Schwarzenberger 1 year ago
parent
commit
7e093efa2a
3 changed files with 33 additions and 2 deletions
  1. 1 0
      .gitignore
  2. 23 2
      utils/declutter.js
  3. 9 0
      utils/sites.js

+ 1 - 0
.gitignore

@@ -1,4 +1,5 @@
 # Ignore Mac system files
 node_modules
+extensions
 .DS_store
 *.zip

+ 23 - 2
utils/declutter.js

@@ -1,6 +1,9 @@
 const { JSDOM } = require('jsdom');
 const puppeteer = require('puppeteer');
 const Readability = require('readability');
+const path = require('path');
+const fs = require('fs');
+const util = require('util');
 
 const domToNode = require('./dom-node');
 const telegraph = require('./telegraph');
@@ -125,14 +128,32 @@ const buildReadableContent = async (tab, url) => {
   return { content, title: article.title };
 };
 
+const getExtensions = async () => {
+  const extensionDir = path.join(__dirname, '../extensions');
+  const exists = await util.promisify(fs.exists)(extensionDir);
+  if (!exists) {
+    return [];
+  }
+  const ls = await util.promisify(fs.readdir)(extensionDir);
+  const extensions = ls.map(name => '--load-extension=' + path.join(extensionDir, name));
+  return extensions;
+};
+
+const UA = site => {
+  if (!site || !site.userAgent) {
+    return 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36';
+  }
+  return site.userAgent;
+};
+
 module.exports = async url => {
-  const browser = await puppeteer.launch();
+  const browser = await puppeteer.launch({ headless: true, args: [].concat(await getExtensions()) });
   const tab = await browser.newPage();
   try {
     const site = sites.find(s => s.host.test(url));
 
     await tab.setViewport({ width: 2000, height: 10000 });
-    await tab.setUserAgent('Twitterbot/1.0');
+    await tab.setUserAgent(UA(site));
     await tab.goto(url, {
       timeout: site ? site.timeout : 60000,
       waitUntil: site ? site.waitUntil : 'networkidle0'

+ 9 - 0
utils/sites.js

@@ -1,5 +1,6 @@
 module.exports = [
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?newshub\.co\.nz/,
     publisher: 'Newshub',
     premium: '',
@@ -15,6 +16,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?newsroom\.co\.nz/,
     publisher: 'Newsroom',
     premium: '',
@@ -30,6 +32,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?noted\.co\.nz/,
     publisher: 'Noted',
     premium: '',
@@ -45,6 +48,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?nzherald\.co\.nz/,
     publisher: 'NZ Herald',
     premium: ' Premium',
@@ -60,6 +64,7 @@ module.exports = [
     waitUntil: 'networkidle0'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?r(adio)?nz\.co\.nz/,
     publisher: 'Radio NZ',
     premium: '',
@@ -75,6 +80,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?stuff\.co\.nz/,
     publisher: 'Stuff',
     premium: '',
@@ -96,6 +102,7 @@ module.exports = [
     waitUntil: 'networkidle0'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?thespinoff\.co\.nz/,
     publisher: 'The Spinoff',
     premium: '',
@@ -111,6 +118,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?odt\.co\.nz/,
     publisher: 'Otago Daily Times',
     premium: '',
@@ -136,6 +144,7 @@ module.exports = [
     waitUntil: 'domcontentloaded'
   },
   {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
     host: /(www\.)?theguardian\.com/,
     publisher: 'The Guardian',
     premium: '',