major rewrite.

Jason Schwarzenberger 10 months ago
parent
commit
2f8ccf2d22

+ 2 - 2
.gitmodules

@@ -1,3 +1,3 @@
-[submodule "bypass-paywalls-chrome"]
-	path = bypass-paywalls-chrome
+[submodule "vendor/bypass-paywalls-chrome"]
+	path = vendor/bypass-paywalls-chrome
 	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git

+ 26 - 139
index.js

@@ -1,151 +1,38 @@
 const express = require("express");
 const bodyParser = require("body-parser");
-const path = require("path");
-const NodeCache = require("node-cache");
 
-const { declutter, telegraph } = require("./utils/declutter");
-const { disqus } = require("./utils/disqus");
-
-const MINUTE = 60;
-const HOUR = 60 * MINUTE;
+const { telegraph } = require('./utils/cache');
+const headless = require('./routes/headless');
+const simple = require('./routes/simple');
 
 const port = process.env.NODE_PORT || 3000;
 const app = express();
-const cache = new NodeCache({ stdTTL: 24 * HOUR });
-const declutter_cache = new NodeCache({ stdTTL: 30 * MINUTE });
-const comment_cache = new NodeCache({ stdTTL: 30 * MINUTE });
 
 app.use(bodyParser.json());
 app.use(bodyParser.urlencoded({ extended: true }));
-
-app.get("/favicon.ico", async (req, res) => res.sendStatus(404));
-app.get("/loading.svg", async (req, res) =>
-  res.sendFile(path.join(__dirname, "/public/loading.svg"))
-);
-app.get("/style.css", async (req, res) =>
-  res.sendFile(path.join(__dirname, "/public/style.css"))
-);
-app.get("/script.js", async (req, res) =>
-  res.sendFile(path.join(__dirname, "/public/script.js"))
-);
-app.get("/keys.js", async (req, res) =>
-  res.sendFile(path.join(__dirname, "/public/keys.js"))
-);
-app.get("/", async (req, res) =>
-  res.sendFile(path.join(__dirname, "/public/index.html"))
-);
-
-app.get("/keys.json", async (req, res) => {
-  const pages = cache.keys().map((key) => cache.get(key));
-  return res.send(pages);
-});
-
-app.post("/", async (req, res) => {
-  const url = req.body.url;
-  const redirect = !!req.body.redirect;
-  const nocache = !!req.body.nocache;
-  try {
-    if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
-      return res.status(400);
-    }
-    let page = cache.get(url);
-    let readable = declutter_cache.get(url);
-    if (nocache) {
-      page = undefined;
-      readable = undefined;
-      console.log('[simple] no cache');
-    }
-    console.log('[simple] have cached page', !!page);
-    if (!page) {
-      console.log('[simple] have cached readable', !!readable);
-      if (!readable) {
-        console.log('[simple] doing a declutter');
-        readable = await declutter(url);
-        declutter_cache.set(url, readable);
-        console.log('[simple] have decluttered readable', !!readable);
-      }
-      console.log('[simple] doing a page');
-      page = await telegraph(url, readable);
-      console.log('[simple] have created page', !!page);
-      if (page) {
-        cache.set(url, {
-          author: page.author,
-          author_url: page.author_url,
-          description: page.description,
-          title: page.title,
-          url: page.url,
-        });
-      }
-    }
-    if (!page) {
-      return res.status(500);
-    }
-    if (redirect) {
-      console.log('[simple] sent page redirect');
-      return res.redirect(page.url);
-    }
-    console.log('[simple] sent page url');
-    return res.send(page.url);
-  } catch (e) {
-    if (/timeout/i.test(e.message)) {
-      return res.status(504);
-    }
-    return res.status(500);
-  }
-});
-
-app.post("/details", async (req, res) => {
-  const url = req.body.url;
-  const nocache = !!req.body.nocache;
-  if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
-    return res.status(400);
-  }
-  let page = cache.get(url);
-  let readable = declutter_cache.get(url);
-  if (nocache) {
-    page = undefined;
-    readable = undefined;
-    console.log('[details] no cache');
-  }
-  console.log('[details] have cached readable', !!readable);
-  if (!readable) {
-    console.log('[details] doing a declutter');
-    readable = await declutter(url);
-    declutter_cache.set(url, readable);
-    console.log('[details] have decluttered readable', !!readable);
-  }
-  if (!readable) {
-    return res.status(500);
-  }
-  console.log('[details] sent readable');
-  return res.send(readable);
-});
-
-
-
-app.post("/comments", async (req, res) => {
-  const url = req.body.url;
-  const nocache = !!req.body.nocache;
-  if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
-    return res.status(400);
-  }
-  let comments = comment_cache.get(url);
-  if (nocache) {
-    comments = undefined;
-    console.log('[comments] no cache');
-  }
-  console.log('[comments] have cached comments', !!comments);
-  if (!comments) {
-    console.log('[comments] doing a disqus');
-    comments = await disqus(url);
-    comment_cache.set(url, comments);
-    console.log('[comments] have disqus comments', !!comments);
-  }
-  if (!comments) {
-    return res.status(500);
-  }
-  console.log('[comments] sent comments');
-  return res.send(comments);
+app.use(express.static('public'));
+
+app.get("/recent.json", async (_, res) => res.send(telegraph.keys().map((key) => telegraph.get(key))));
+
+app.use('/', headless.router());
+app.use('/headless', headless.router());
+app.use('/simple', simple.router());
+
+
+app.get('/_form', (_, res) => {
+	const prefixes = ['', '/headless', '/simple'];
+	const routes = ['/', '/telegraph', '/content', '/details', '/comments'];
+	const html = prefixes.flatMap(p => routes.map(r => p + r))
+		.map(route => `
+	<form method="POST" action="${route}" accept-charset="UTF-8">
+		<fieldset>
+			<legend>route: POST ${route}</legend>
+			<p><input name="url"/></p>
+			<p><input type="checkbox" name="redirect" /> redirect?</p>
+			<p><button type="submit">SUBMIT</button></p>
+		</fieldset>
+	</form>`).join('<hr />');
+	res.send(html);
 });
 
 app.listen(port, () => console.log(`Declutter app listening on port ${port}!`));
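
For reference, a minimal sketch of exercising the reorganized routes over HTTP, assuming the server is running locally on the default port 3000 (the article URL is illustrative):

    const fetch = require('node-fetch');

    (async () => {
      // "/" and "/headless" share the Playwright-backed router; "/simple" is fetch-based
      const res = await fetch('http://localhost:3000/simple/details', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url: 'https://example.com/article' }),
      });
      console.log(await res.json()); // Readability-style details object
    })();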

utils/telegraph.js → lib/telegraph.js


+ 1 - 2
package.json

@@ -13,9 +13,8 @@
     "body-parser": "^1.19.0",
     "express": "^4.17.1",
     "jsdom": "^15.1.1",
-    "json5": "^2.1.3",
     "node-cache": "^5.1.2",
     "node-fetch": "^2.6.1",
     "playwright": "^1.5.1"
   }
-}
+}

+ 1 - 1
public/keys.js

@@ -1,6 +1,6 @@
 (function () {
   function fetchLinks() {
-    return fetch("/keys.json")
+    return fetch("/recent.json")
       .then((response) => {
         if (response.ok) {
           return response.json();

+ 31 - 0
routes/headless/get-comments.js

@@ -0,0 +1,31 @@
+const scraper = require("../../scraper/headless");
+const cache = require('../../utils/cache');
+
+module.exports = {
+	getComments
+};
+
+async function getComments(req, res) {
+	const url = req.body.url;
+	const nocache = !!req.body.nocache;
+	if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+		// sendStatus(), unlike status(), actually terminates the response
+		return res.sendStatus(400);
+	}
+	let comments = cache.comment.get(url);
+	if (nocache) {
+		comments = undefined;
+		console.log('[headless/comments] no cache');
+	}
+	console.log('[headless/comments] have cached comments', !!comments);
+	if (!comments) {
+		console.log('[headless/comments] doing a disqus');
+		comments = await scraper.getComments(url);
+		cache.comment.set(url, comments);
+		console.log('[headless/comments] have disqus comments', !!comments);
+	}
+	if (!comments) {
+		return res.sendStatus(500);
+	}
+	console.log('[headless/comments] sent comments');
+	return res.send(comments);
+}

+ 34 - 0
routes/headless/get-content.js

@@ -0,0 +1,34 @@
+const scraper = require("../../scraper/headless");
+const cache = require('../../utils/cache');
+
+module.exports = {
+	getContent,
+};
+
+async function getContent(req, res) {
+	const url = req.body.url;
+	const nocache = !!req.body.nocache;
+	if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+		return res.sendStatus(400);
+	}
+	console.log(`[headless/html] for url ${url}`);
+	let readable = cache.declutter.get(url);
+	if (nocache) {
+		readable = undefined;
+		console.log('[headless/html] no cache');
+	}
+	console.log('[headless/html] have cached readable', !!readable);
+	if (!readable) {
+		console.log('[headless/html] doing a declutter');
+		readable = await scraper.getDetails(url);
+		cache.declutter.set(url, readable);
+		console.log('[headless/html] have decluttered readable', !!readable);
+	}
+	if (!readable || !readable.content) {
+		return res.sendStatus(500);
+	}
+	console.log('[headless/html] sent readable markup');
+	return res.send(readable.content);
+}

+ 33 - 0
routes/headless/get-details.js

@@ -0,0 +1,33 @@
+const scraper = require("../../scraper/headless");
+const cache = require('../../utils/cache');
+
+module.exports = {
+	getDetails,
+};
+
+async function getDetails(req, res) {
+	const url = req.body.url;
+	const nocache = !!req.body.nocache;
+	if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+		return res.sendStatus(400);
+	}
+	let readable = cache.declutter.get(url);
+	if (nocache) {
+		readable = undefined;
+		console.log('[headless/details] no cache');
+	}
+	console.log('[headless/details] have cached readable', !!readable);
+	if (!readable) {
+		console.log('[headless/details] doing a declutter');
+		readable = await scraper.getDetails(url);
+		cache.declutter.set(url, readable);
+		console.log('[headless/details] have decluttered readable', !!readable);
+	}
+	if (!readable) {
+		return res.sendStatus(500);
+	}
+	console.log('[headless/details] sent readable');
+	return res.send(readable);
+}

+ 61 - 0
routes/headless/get-telegraph-link.js

@@ -0,0 +1,61 @@
+const scraper = require("../../scraper/headless");
+const { publishReadable } = require("../../utils/publish-telegraph");
+const cache = require('../../utils/cache');
+
+module.exports = {
+	getTelegraphLink,
+};
+
+async function getTelegraphLink(req, res) {
+	const url = req.body.url;
+	const redirect = !!req.body.redirect;
+	const nocache = !!req.body.nocache;
+	try {
+		if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+			return res.sendStatus(400);
+		}
+		let page = cache.telegraph.get(url);
+		let readable = cache.declutter.get(url);
+		if (nocache) {
+			page = undefined;
+			readable = undefined;
+			console.log('[headless/telegraph] no cache');
+		}
+		console.log('[headless/telegraph] have cached page', !!page);
+		if (!page) {
+			console.log('[headless/telegraph] have cached readable', !!readable);
+			if (!readable) {
+				console.log('[headless/telegraph] doing a declutter');
+				readable = await scraper.getDetails(url);
+				cache.declutter.set(url, readable);
+				console.log('[headless/telegraph] have decluttered readable', !!readable);
+			}
+			console.log('[headless/telegraph] doing a page');
+			page = await publishReadable(url, readable);
+			console.log('[headless/telegraph] have created page', !!page);
+			if (page) {
+				cache.telegraph.set(url, {
+					author: page.author,
+					author_url: page.author_url,
+					description: page.description,
+					title: page.title,
+					url: page.url,
+				});
+			}
+		}
+		if (!page) {
+			return res.sendStatus(500);
+		}
+		if (redirect) {
+			console.log('[headless/telegraph] sent page redirect');
+			return res.redirect(page.url);
+		}
+		console.log('[headless/telegraph] sent page url');
+		return res.send(page.url);
+	} catch (e) {
+		if (/timeout/i.test(e.message)) {
+			return res.sendStatus(504);
+		}
+		return res.sendStatus(500);
+	}
+}

+ 24 - 0
routes/headless/index.js

@@ -0,0 +1,24 @@
+const express = require('express');
+
+const { getTelegraphLink } = require('./get-telegraph-link');
+const { getComments } = require('./get-comments');
+const { getContent } = require('./get-content');
+const { getDetails } = require('./get-details');
+
+module.exports = {
+	getComments,
+	getContent,
+	getDetails,
+	getTelegraphLink,
+	router
+};
+
+function router() {
+	const router = express.Router();
+	router.post("/", getTelegraphLink);
+	router.post("/telegraph", getTelegraphLink);
+	router.post("/content", getContent);
+	router.post("/details", getDetails);
+	router.post("/comments", getComments);
+	return router;
+}

+ 20 - 0
routes/simple/get-content.js

@@ -0,0 +1,20 @@
+const scraper = require("../../scraper/simple");
+
+module.exports = {
+	getContent,
+};
+
+async function getContent(req, res) {
+	const url = req.body.url;
+	if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+		return res.sendStatus(400);
+	}
+	console.log('[simple/html] doing a declutter');
+	const readable = await scraper.getDetails(url);
+	console.log('[simple/html] have decluttered readable', !!readable);
+	if (!readable || !readable.content) {
+		return res.sendStatus(500);
+	}
+	console.log('[simple/html] sent readable markup');
+	return res.send(readable.content);
+}

+ 20 - 0
routes/simple/get-details.js

@@ -0,0 +1,20 @@
+const scraper = require("../../scraper/simple");
+
+module.exports = {
+	getDetails,
+};
+
+async function getDetails(req, res) {
+	const url = req.body.url;
+	if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+		return res.sendStatus(400);
+	}
+	console.log('[simple/details] doing a declutter');
+	const readable = await scraper.getDetails(url);
+	console.log('[simple/details] have decluttered readable', !!readable);
+	if (!readable) {
+		return res.sendStatus(500);
+	}
+	console.log('[simple/details] sent readable');
+	return res.send(readable);
+}

+ 36 - 0
routes/simple/get-telegraph-link.js

@@ -0,0 +1,36 @@
+const scraper = require("../../scraper/simple");
+const { publishReadable } = require("../../utils/publish-telegraph");
+
+module.exports = {
+	getTelegraphLink,
+};
+
+async function getTelegraphLink(req, res) {
+	const url = req.body.url;
+	const redirect = !!req.body.redirect;
+	try {
+		if (!/https?:\/\/(www\.)?.*\/.*/i.test(url)) {
+			return res.sendStatus(400);
+		}
+		console.log('[simple/telegraph] doing a declutter');
+		const readable = await scraper.getDetails(url);
+		console.log('[simple/telegraph] have decluttered readable', !!readable);
+		console.log('[simple/telegraph] doing a page');
+		const page = await publishReadable(url, readable);
+		console.log('[simple/telegraph] have created page', !!page);
+		if (!page) {
+			return res.sendStatus(500);
+		}
+		if (redirect) {
+			console.log('[simple/telegraph] sent page redirect');
+			return res.redirect(page.url);
+		}
+		console.log('[simple/telegraph] sent page url');
+		return res.send(page.url);
+	} catch (e) {
+		if (/timeout/i.test(e.message)) {
+			return res.sendStatus(504);
+		}
+		return res.sendStatus(500);
+	}
+}

+ 21 - 0
routes/simple/index.js

@@ -0,0 +1,21 @@
+const express = require('express');
+
+const { getTelegraphLink } = require('./get-telegraph-link');
+const { getContent } = require('./get-content');
+const { getDetails } = require('./get-details');
+
+module.exports = {
+	getContent,
+	getDetails,
+	getTelegraphLink,
+	router
+};
+
+function router() {
+	const router = express.Router();
+	router.post("/", getTelegraphLink);
+	router.post("/telegraph", getTelegraphLink);
+	router.post("/content", getContent);
+	router.post("/details", getDetails);
+	return router;
+}

+ 3 - 3
utils/disqus.js → scraper/headless/get-comments.js

@@ -1,11 +1,11 @@
 const { JSDOM } = require("jsdom");
 const { firefox } = require("playwright");
-const { getUserAgent } = require('./user-agent');
-const { disqusThread } = require('./disqus-thread');
+const { getUserAgent } = require('../../utils/user-agent');
+const { disqusThread } = require('../../utils/disqus-thread');
 
 const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
 
-module.exports.disqus = async (url) => {
+module.exports.getComments = async (url) => {
 	const { userAgent, headers } = getUserAgent(url);
 
 	const browser = await firefox.launch({ args: [], headless: true });

+ 50 - 0
scraper/headless/get-details.js

@@ -0,0 +1,50 @@
+const { firefox } = require("playwright");
+const { JSDOM } = require("jsdom");
+const { Readability } = require("@mozilla/readability");
+
+const { getUserAgent } = require('../../utils/user-agent');
+const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
+const { addMetadata, extractMetadata } = require('../../utils/extract-metadata');
+
+module.exports.getDetails = async (url) => {
+	const { userAgent, headers } = getUserAgent(url);
+
+	const browser = await firefox.launch({ args: [], headless: true });
+	const tab = await browser.newPage({
+		extraHTTPHeaders: headers,
+		userAgent,
+		viewport: { width: 2000, height: 10000 },
+	});
+
+	try {
+		await tab.route(/.*/, (route) => {
+			const routeUrl = route.request().url();
+			const blockedDomains = Object.keys(blockedRegexes);
+			const domain = matchUrlDomain(blockedDomains, routeUrl);
+			if (domain && routeUrl.match(blockedRegexes[domain])) {
+				return route.abort();
+			}
+			return route.continue();
+		});
+		await tab.addInitScript({ path: "vendor/bypass-paywalls-chrome/src/js/contentScript.js" });
+		await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filter.js" });
+		await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
+		await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
+		await tab.waitForTimeout(2000);
+
+		const html = await tab.content();
+		const doc = new JSDOM(html, { url });
+		const reader = new Readability(doc.window.document);
+		let readable = reader.parse();
+		let { author, publisher, authorType } = extractMetadata(html, url)
+
+		readable = addMetadata(readable, authorType, author, publisher, url);
+
+		return readable;
+	} finally {
+		await tab.close();
+		await browser.close();
+	}
+};
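
A sketch of driving the relocated headless scraper directly, assuming Playwright's Firefox build is already downloaded and the snippet runs from the repository root:

    const scraper = require('./scraper/headless');

    (async () => {
      // getDetails launches Firefox, injects the paywall-bypass and cosmetic-filter
      // init scripts, then runs Readability over the rendered markup
      const readable = await scraper.getDetails('https://example.com/article');
      if (readable) console.log(readable.title, '-', readable.byline);
    })();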

+ 7 - 0
scraper/headless/index.js

@@ -0,0 +1,7 @@
+const { getDetails } = require('./get-details');
+const { getComments } = require('./get-comments');
+
+module.exports = {
+	getDetails,
+	getComments
+};

scripts/cosmeticFilter.js → scraper/headless/scripts/cosmetic-filter.js


scripts/fix-relative-links.js → scraper/headless/scripts/fix-relative-links.js


+ 32 - 0
scraper/simple/get-details.js

@@ -0,0 +1,32 @@
+const fetch = require('node-fetch');
+const { JSDOM } = require("jsdom");
+const { Readability } = require("@mozilla/readability");
+
+const { getUserAgent } = require('../../utils/user-agent');
+const { addMetadata, extractMetadata } = require('../../utils/extract-metadata');
+
+module.exports.getDetails = async (url) => {
+	try {
+		const { userAgent, headers } = getUserAgent(url);
+		const response = await fetch(url, {
+			headers: {
+				...headers,
+				'User-Agent': userAgent
+			}
+		});
+		if (!response.ok) {
+			// node-fetch exposes `status`; there is no Express `res` in this scraper
+			// module, so surface the failure to the caller instead of responding here
+			throw new Error(`request failed with status ${response.status}`);
+		}
+		const html = await response.text();
+		const doc = new JSDOM(html, { url });
+		const reader = new Readability(doc.window.document);
+		let readable = reader.parse();
+		let { author, publisher, authorType } = extractMetadata(html, url)
+
+		readable = addMetadata(readable, authorType, author, publisher, url);
+
+		return readable;
+	} catch (e) {
+		throw e;
+	}
+};

+ 5 - 0
scraper/simple/index.js

@@ -0,0 +1,5 @@
+const { getDetails } = require('./get-details');
+
+module.exports = {
+	getDetails,
+};

+ 108 - 0
scraper/simple/scripts/cosmetic-filter.js

@@ -0,0 +1,108 @@
+(function () {
+  removeHiddenElements();
+
+  if (matchDomain("stuff.co.nz")) {
+    removeSelectors([
+      ".support-brief-container",
+      '[class*="donation-in-"]',
+      ".sics-component__sharebar",
+      ".breaking-news-pointer",
+      ".bigbyline-container",
+      [
+        ".sics-component__html-injector.sics-component__story__paragraph",
+        "READ MORE:",
+      ],
+    ]);
+  }
+  if (matchDomain("nzherald.co.nz")) {
+    removeSelectors([
+      "[href$='#commenting-widget']",
+      ".related-articles",
+      ".article__print-button",
+      ".share-bar",
+      ".c-suggest-links.read-more-links",
+      ".website-of-year",
+      ".meta-data",
+      ".article__kicker",
+      ".author__image",
+    ]);
+  }
+  if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
+    removeSelectors([".c-advert-app", ".c-sub-nav"]);
+  }
+  if (matchDomain(["newsroom.co.nz"])) {
+    removeSelectors([".article_content__section", ".bio"]);
+  }
+  if (matchDomain(["newshub.co.nz"])) {
+    removeSelectors([
+      ".c-ArticleHeading-authorPicture",
+      ".relatedarticles",
+      ".ArticleAttribution",
+      '.GlobalFooter'
+    ]);
+  }
+  if (matchDomain(["tvnz.co.nz"])) {
+    removeSelectors([".signup-container container"]);
+  }
+  if (matchDomain(["thespinoff.co.nz"])) {
+    removeSelectors([
+      ".the-spinoff-club-interruptive",
+      ".bulletin-signup",
+      ".sponsor_post_footer"
+    ]);
+  }
+
+  function matchDomain(domains) {
+    const hostname = window.location.hostname;
+    if (typeof domains === "string") {
+      domains = [domains];
+    }
+    return domains.some(
+      (domain) => hostname === domain || hostname.endsWith("." + domain)
+    );
+  }
+
+  function removeDOMElement(...elements) {
+    for (const element of elements) {
+      if (element) {
+        element.remove();
+      }
+    }
+  }
+
+  function pageContains(selector, text) {
+    const elements = document.querySelectorAll(selector);
+    return Array.prototype.filter.call(elements, function (element) {
+      return RegExp(text).test(element.textContent);
+    });
+  }
+
+  function removeHiddenElements() {
+    window.setTimeout(function () {
+      const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
+      Array.from(document.querySelectorAll(selector))
+        .filter((element) => {
+          const computed = getComputedStyle(element);
+          const displayNone = computed["display"] === "none";
+          const visibilityHidden = computed["visibility"] === "hidden";
+          return displayNone || visibilityHidden;
+        })
+        .forEach((element) => element && element.remove());
+    }, 1500);
+  }
+
+  function removeSelectors(selectors) {
+    window.setTimeout(function () {
+      const elements = selectors.flatMap((s) => {
+        if (typeof s === "string") {
+          return Array.from(document.querySelectorAll(s));
+        }
+        if (s && s.constructor.name === "Array") {
+          return pageContains(...s);
+        }
+        return undefined;
+      });
+      removeDOMElement(...elements);
+    }, 500);
+  }
+})();

+ 14 - 0
scraper/simple/scripts/fix-relative-links.js

@@ -0,0 +1,14 @@
+(function () {
+	const { host, protocol } = window.location;
+	const url = `${protocol}//${host}`;
+	[
+		['[src^="/"]', 'src'],
+		['[href^="/"]', 'href']
+	].forEach(([selector, attribute]) => {
+		Array.from(document.querySelectorAll(selector))
+			.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
+			.forEach((e) => {
+				e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
+			});
+	});
+})();

+ 14 - 0
utils/cache.js

@@ -0,0 +1,14 @@
+const NodeCache = require("node-cache");
+
+const MINUTE = 60;
+const HOUR = 60 * MINUTE;
+
+const telegraph = new NodeCache({ stdTTL: 24 * HOUR });
+const declutter = new NodeCache({ stdTTL: 30 * MINUTE });
+const comment = new NodeCache({ stdTTL: 30 * MINUTE });
+
+module.exports = {
+	telegraph,
+	declutter,
+	comment
+};
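
All three caches follow node-cache's seconds-based TTL semantics; a small sketch of the intended usage from a route module (key and payload are illustrative):

    const cache = require('./utils/cache');

    // telegraph entries live for 24 hours; declutter and comment entries for 30 minutes
    cache.telegraph.set('https://example.com/a', { title: 'A', url: 'https://telegra.ph/a' });
    const hit = cache.telegraph.get('https://example.com/a'); // undefined once the TTL lapses

    // /recent.json replays every live key, as in index.js above
    const recent = cache.telegraph.keys().map((key) => cache.telegraph.get(key));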

+ 0 - 79
utils/declutter.js

@@ -1,79 +0,0 @@
-const { JSDOM } = require("jsdom");
-const { firefox } = require("playwright");
-const { Readability } = require("@mozilla/readability");
-
-const { addMetadata, extractMetadata } = require('./extract-metadata');
-const domToNode = require("./dom-node");
-const telegraph = require("./telegraph");
-
-const { getUserAgent } = require('./user-agent');
-const { blockedRegexes, matchUrlDomain } = require("./sites");
-
-
-module.exports.declutter = async (url) => {
-  const { userAgent, headers } = getUserAgent(url);
-
-  const browser = await firefox.launch({ args: [], headless: true });
-  const tab = await browser.newPage({
-    extraHTTPHeaders: headers,
-    userAgent,
-    viewport: { width: 2000, height: 10000 },
-  });
-
-  try {
-    await tab.route(/.*/, (route) => {
-      const routeUrl = route.request().url();
-      const blockedDomains = Object.keys(blockedRegexes);
-      const domain = matchUrlDomain(blockedDomains, routeUrl);
-      if (domain && routeUrl.match(blockedRegexes[domain])) {
-        return route.abort();
-      }
-      return route.continue();
-    });
-    await tab.addInitScript({ path: "bypass-paywalls-chrome/src/js/contentScript.js" });
-    await tab.addInitScript({ path: "scripts/cosmeticFilter.js" });
-    await tab.addInitScript({ path: "scripts/fix-relative-links.js" });
-    await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
-    await tab.waitForTimeout(2000);
-
-    let { author, publisher, authorType } = await extractMetadata(tab, url)
-
-    const body = await tab.content();
-    const doc = new JSDOM(body, { url });
-    const reader = new Readability(doc.window.document);
-    let readable = reader.parse();
-
-    readable = addMetadata(readable, authorType, author, publisher, url);
-
-    return readable;
-  } catch (e) {
-    console.error(e);
-    throw e;
-  } finally {
-    await tab.close();
-    await browser.close();
-  }
-};
-
-module.exports.telegraph = async (url, readable) => {
-  const account = await telegraph.createAccount({
-    author_name: readable.author,
-    author_url: url,
-    short_name: (
-      readable.author ||
-      readable.publisher ||
-      readable.byline
-    ).substring(0, 31),
-  });
-
-  const dom = new JSDOM(`<html><body>${readable.content}</body></html>`);
-  const div = dom.window.document.querySelector("body");
-  const source = dom.window.document.createElement("p");
-  source.innerHTML = `<a href="${url}"">${url}</a>`;
-  div.prepend(source);
-  const content = domToNode(div).children.filter((m) => {
-    return !m.trim || m.trim().length > 0;
-  });
-  const page = await telegraph.createPage(readable.title, content, account);
-  return page;
-};

+ 37 - 36
utils/extract-metadata.js

@@ -4,47 +4,48 @@ module.exports.extractMetadata = extractMetadata;
 module.exports.cleanHtmlText = cleanHtmlText;
 module.exports.addMetadata = addMetadata;
 
-async function extractMetadata(tab, url) {
-	return await tab.evaluate((url) => {
-		const meta = {
-			author: "",
-			publisher: new URL(url).host,
-			authorType: "",
-		};
+function extractMetadata(html, url) {
+	const dom = new JSDOM(html, { url });
+	const document = dom.window.document;
 
-		const ogProps = document.querySelector('meta[property="og:site_name"]');
-		const itemProps = document.querySelectorAll('[itemprop="author"]');
-		const ldJsonTags = document.querySelectorAll(
-			'script[type="application/ld+json"]'
-		);
+	const meta = {
+		author: "",
+		publisher: new URL(url).host,
+		authorType: "",
+	};
 
-		if (ogProps) {
-			meta.author = ogProps && ogProps.content ? ogProps.content : "";
-			meta.authorType = "og";
-		}
+	const ogProps = document.querySelector('meta[property="og:site_name"]');
+	const itemProps = document.querySelectorAll('[itemprop="author"]');
+	const ldJsonTags = document.querySelectorAll(
+		'script[type="application/ld+json"]'
+	);
+
+	if (ogProps) {
+		meta.author = ogProps && ogProps.content ? ogProps.content : "";
+		meta.authorType = "og";
+	}
 
-		Array.from(itemProps).forEach((element) => {
-			meta.publisher = meta.author;
-			meta.author = element.innerText;
-			meta.authorType = "ld";
-		});
+	Array.from(itemProps).forEach((element) => {
+		meta.publisher = meta.author;
+		meta.author = element.innerText;
+		meta.authorType = "ld";
+	});
 
-		Array.from(ldJsonTags).forEach((ldTag) => {
-			try {
-				const ld = JSON.parse(ldTag.innerHTML);
-				if (ld["@type"] === "Article") {
-					if (ld.author && ld.author["@type"] === "Person") {
-						meta.author = ld.author.name;
-						meta.authorType = "ld";
-					}
-					if (ld.publisher && ld.publisher["@type"] === "Organization") {
-						meta.publisher = ld.publisher.name;
-					}
+	Array.from(ldJsonTags).forEach((ldTag) => {
+		try {
+			const ld = JSON.parse(ldTag.innerHTML);
+			if (ld["@type"] === "Article") {
+				if (ld.author && ld.author["@type"] === "Person") {
+					meta.author = ld.author.name;
+					meta.authorType = "ld";
+				}
+				if (ld.publisher && ld.publisher["@type"] === "Organization") {
+					meta.publisher = ld.publisher.name;
 				}
-			} catch (e) { }
-		});
-		return meta;
-	}, url);
+			}
+		} catch (e) { }
+	});
+	return meta;
 };
 
 function cleanHtmlText(text) {
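
Because extraction now works on a raw HTML string instead of a live Playwright tab, it can be exercised without a browser. A minimal sketch, assuming the module requires jsdom above this hunk, with illustrative markup:

    const { extractMetadata } = require('./utils/extract-metadata');

    const html = `<html><head>
      <meta property="og:site_name" content="Example News" />
    </head><body></body></html>`;

    // og:site_name is present, so author comes from the og tag while
    // publisher stays the URL host
    console.log(extractMetadata(html, 'https://example.com/story'));
    // { author: 'Example News', publisher: 'example.com', authorType: 'og' }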

+ 28 - 0
utils/publish-telegraph.js

@@ -0,0 +1,28 @@
+const { JSDOM } = require("jsdom");
+const domToNode = require("./dom-node");
+
+const telegraph = require("../lib/telegraph");
+
+
+module.exports.publishReadable = async (url, readable) => {
+  const account = await telegraph.createAccount({
+    author_name: readable.byline,
+    author_url: url,
+    short_name: (
+      readable.author ||
+      readable.publisher ||
+      readable.byline
+    ).substring(0, 31),
+  });
+
+  const dom = new JSDOM(`<html><body>${readable.content}</body></html>`);
+  const div = dom.window.document.querySelector("body");
+  const source = dom.window.document.createElement("p");
+  source.innerHTML = `<a href="${url}">${url}</a>`;
+  div.prepend(source);
+  const content = domToNode(div).children.filter((m) => {
+    return !m.trim || m.trim().length > 0;
+  });
+  const page = await telegraph.createPage(readable.title, content, account);
+  return page;
+};
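
A sketch of the end-to-end pipeline the route families now share, wiring a scraper into the publisher (paths assume the repository root; the URL is illustrative):

    const { getDetails } = require('./scraper/simple');
    const { publishReadable } = require('./utils/publish-telegraph');

    (async () => {
      const url = 'https://example.com/cluttered-article';
      const readable = await getDetails(url);            // Readability output plus metadata
      const page = await publishReadable(url, readable);
      console.log(page.url);                             // resulting telegra.ph link
    })();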

bypass-paywalls-chrome → vendor/bypass-paywalls-chrome