Jason Schwarzenberger 3 months ago
parent
commit
0acca853ea

+ 1 - 1
bypass-paywalls-chrome

@@ -1 +1 @@
-Subproject commit fff7f483db947e690977bfc80955a53329d3d349
+Subproject commit fb1b09fccbb64f1d782753cc5c425eb0723e596f

+ 1 - 1
utils/cosmeticFilter.js

@@ -88,7 +88,7 @@
           return displayNone || visibilityHidden;
         })
         .forEach((element) => element && element.remove());
-    }, 500);
+    }, 1500);
   }
 
   function removeSelectors(selectors) {

+ 14 - 0
scripts/fix-relative-links.js

@@ -0,0 +1,14 @@
+(function () {
+	const { host, protocol } = window.location;
+	const url = `${protocol}//${host}`;
+	[
+		['[src^="/"]', 'src'],
+		['[href^="/"]', 'href']
+	].forEach(([selector, attribute]) => {
+		Array.from(document.querySelectorAll(selector))
+			.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
+			.forEach((e) => {
+				e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
+			});
+	});
+})();

+ 11 - 0
utils/constants.js

@@ -0,0 +1,11 @@
+const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
+const googleBotIp = '66.249.66.1';
+
+module.exports.googleBot = {
+	userAgent: googleBotUserAgent,
+	ip: googleBotIp,
+	headers: {
+		'User-Agent': googleBotUserAgent,
+		'X-Forwarded-For': googleBotIp,
+	}
+}

+ 25 - 135
utils/declutter.js

@@ -1,159 +1,49 @@
 const { JSDOM } = require("jsdom");
 const { firefox } = require("playwright");
 const { Readability } = require("@mozilla/readability");
-const path = require("path");
-const fs = require("fs");
-const util = require("util");
 
+const { addMetadata, extractMetadata } = require('./extract-metadata');
 const domToNode = require("./dom-node");
 const telegraph = require("./telegraph");
-const {
-  blockedRegexes,
-  matchUrlDomain,
-  useGoogleBotSites,
-} = require("./sites");
 
-const cleanHtmlText = (text) => {
-  const s = new JSDOM("").window.document.createElement("span");
-  s.innerHTML = text;
-  return s.textContent;
-};
-
-const fixRelativeLinks = async (tab, url) => {
-  return await tab.evaluate((url) => {
-    const host = url.split("/").slice(0, 3).join("/");
-
-    Array.from(document.querySelectorAll('[src^="/"]'))
-      .filter(
-        (e) => e.attributes.src && /^\/[^\/]/.test(e.attributes.src.value)
-      )
-      .forEach((e) => {
-        e.attributes.src.value = `${host}${e.attributes.src.value}`;
-      });
-    Array.from(document.querySelectorAll('[href^="/"]'))
-      .filter(
-        (e) => e.attributes.href && /^\/[^\/]/.test(e.attributes.href.value)
-      )
-      .forEach((e) => {
-        e.attributes.href.value = `${host}${e.attributes.href.value}`;
-      });
-  }, url);
-};
-
-const buildReadableContent = async (tab, url) => {
-  const body = await tab.content();
-  const doc = new JSDOM(body, { url });
-  const reader = new Readability(doc.window.document);
-  const article = reader.parse();
-  if (!article) {
-    return { content: '', title: '', byline: '' };
-  }
+const { getUserAgent } = require('./user-agent');
+const { blockedRegexes, matchUrlDomain } = require("./sites");
 
-  return article;
-};
 
 module.exports.declutter = async (url) => {
-  const browser = await firefox.launch({
-    args: [],
-    executablePath: process.env.DECLUTTER_BROWSER_PATH || undefined,
-    headless: true,
-  });
-
-  // override User-Agent to use Googlebot
-  const useGoogleBot = useGoogleBotSites.some(function (item) {
-    return typeof item === "string" && matchUrlDomain(item, url);
-  });
+  const { userAgent, headers } = getUserAgent(url);
 
-  let userAgent = undefined;
-  let extraHTTPHeaders = undefined;
-  if (useGoogleBot) {
-    userAgent =
-      "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
-    extraHTTPHeaders = { "X-Forwarded-For": "66.249.66.1" };
-  }
+  const browser = await firefox.launch({ args: [], headless: true });
   const tab = await browser.newPage({
-    viewport: { width: 2000, height: 10000 },
+    extraHTTPHeaders: headers,
     userAgent,
-    extraHTTPHeaders,
-  });
-  await tab.route(/.*/, (route) => {
-    const routeUrl = route.request().url();
-    const blockedDomains = Object.keys(blockedRegexes);
-    const domain = matchUrlDomain(blockedDomains, routeUrl);
-    if (domain && routeUrl.match(blockedRegexes[domain])) {
-      return route.abort();
-    }
-    return route.continue();
-  });
-
-  await tab.addInitScript({
-    path: "bypass-paywalls-chrome/src/js/contentScript.js",
+    viewport: { width: 2000, height: 10000 },
   });
-  await tab.addInitScript({ path: "utils/cosmeticFilter.js" });
 
   try {
-    await tab.goto(url, {
-      timeout: 60000,
-      waitUntil: "domcontentloaded",
+    await tab.route(/.*/, (route) => {
+      const routeUrl = route.request().url();
+      const blockedDomains = Object.keys(blockedRegexes);
+      const domain = matchUrlDomain(blockedDomains, routeUrl);
+      if (domain && routeUrl.match(blockedRegexes[domain])) {
+        return route.abort();
+      }
+      return route.continue();
     });
+    await tab.addInitScript({ path: "bypass-paywalls-chrome/src/js/contentScript.js" });
+    await tab.addInitScript({ path: "scripts/cosmeticFilter.js" });
+    await tab.addInitScript({ path: "scripts/fix-relative-links.js" });
+    await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
     await tab.waitForTimeout(2000);
-    await fixRelativeLinks(tab, url);
-
-    let { author, publisher, authorType } = await tab.evaluate((url) => {
-      const meta = {
-        author: "",
-        publisher: new URL(url).host,
-        authorType: "",
-      };
-
-      const ogProps = document.querySelector('meta[property="og:site_name"]');
-      const itemProps = document.querySelectorAll('[itemprop="author"]');
-      const ldJsonTags = document.querySelectorAll(
-        'script[type="application/ld+json"]'
-      );
-
-      if (ogProps) {
-        meta.author = ogProps && ogProps.content ? ogProps.content : "";
-        meta.authorType = "og";
-      }
-
-      Array.from(itemProps).forEach((element) => {
-        meta.publisher = meta.author;
-        meta.author = element.innerText;
-        meta.authorType = "ld";
-      });
-
-      Array.from(ldJsonTags).forEach((ldTag) => {
-        try {
-          const ld = JSON.parse(ldTag.innerHTML);
-          if (ld["@type"] === "Article") {
-            if (ld.author && ld.author["@type"] === "Person") {
-              meta.author = ld.author.name;
-              meta.authorType = "ld";
-            }
-            if (ld.publisher && ld.publisher["@type"] === "Organization") {
-              meta.publisher = ld.publisher.name;
-            }
-          }
-        } catch (e) { }
-      });
-      return meta;
-    }, url);
 
-    const readable = await buildReadableContent(tab, url);
-    if (authorType !== "ld") {
-      publisher = author;
-      author = readable.byline;
-    }
+    let { author, publisher, authorType } = await extractMetadata(tab, url)
 
-    const authorName = cleanHtmlText(
-      [author, publisher].filter((s) => !!s && !!s.trim()).join(" • ")
-    );
+    const body = await tab.content();
+    const doc = new JSDOM(body, { url });
+    const reader = new Readability(doc.window.document);
+    let readable = reader.parse();
 
-    readable.byline = authorName;
-    readable.author = author;
-    readable.publisher = publisher;
-    readable.url = url;
+    readable = addMetadata(readable, authorType, author, publisher, url);
 
     return readable;
   } catch (e) {

+ 21 - 0
utils/disqus-thread.js

@@ -0,0 +1,21 @@
+module.exports.disqusThread = data => {
+	const comments = data.response.posts.reduce((c, post) => ({
+		...c,
+		[post.id.toString()]: {
+			author: post.author.name,
+			authorLink: post.author.profileUrl,
+			date: post.createdAt,
+			text: post.raw_message,
+			score: post.points,
+			children: [],
+			id: post.id.toString(),
+			parent: (post.parent || '').toString(),
+		}
+	}), {});
+	Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
+		const comment = comments[id];
+		comments[comment.parent].children.push(comment);
+	});
+	const parents = Object.keys(comments).filter(id => comments[id].parent).map(id => comments[id]);
+	return parents;
+};

+ 9 - 51
utils/disqus.js

@@ -1,65 +1,24 @@
 const { JSDOM } = require("jsdom");
 const { firefox } = require("playwright");
-const {
-	blockedRegexes,
-	matchUrlDomain,
-	useGoogleBotSites,
-} = require("./sites");
+const { getUserAgent } = require('./user-agent');
+const { disqusThread } = require('./disqus-thread');
 
-const disqusThread = data => {
-	const comments = data.response.posts.reduce((c, post) => ({
-		...c,
-		[post.id.toString()]: {
-			author: post.author.name,
-			authorLink: post.author.profileUrl,
-			date: post.createdAt,
-			text: post.raw_message,
-			score: post.points,
-			children: [],
-			id: post.id.toString(),
-			parent: (post.parent || '').toString(),
-		}
-	}), {});
-	Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
-		const comment = comments[id];
-		comments[comment.parent].children.push(comment);
-	});
-	const parents = Object.keys(comments).filter(id => comments[id].parent).map(id => comments[id]);
-	return parents;
-};
+const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
 
 module.exports.disqus = async (url) => {
-	const browser = await firefox.launch({
-		args: [],
-		executablePath: process.env.DECLUTTER_BROWSER_PATH || undefined,
-		headless: true,
-	});
-
-	// override User-Agent to use Googlebot
-	const useGoogleBot = useGoogleBotSites.some(function (item) {
-		return typeof item === "string" && matchUrlDomain(item, url);
-	});
+	const { userAgent, headers } = getUserAgent(url);
 
-	let userAgent = undefined;
-	let extraHTTPHeaders = undefined;
-	if (useGoogleBot) {
-		userAgent =
-			"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
-		extraHTTPHeaders = { "X-Forwarded-For": "66.249.66.1" };
-	}
+	const browser = await firefox.launch({ args: [], headless: true });
 	const tab = await browser.newPage({
-		viewport: { width: 2000, height: 10000 },
+		extraHTTPHeaders: headers,
 		userAgent,
-		extraHTTPHeaders,
+		viewport: { width: 2000, height: 10000 },
 	});
 
 	try {
-		await tab.goto(url, {
-			timeout: 60000,
-			waitUntil: "domcontentloaded",
-		});
+		await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
 
-		const response = await tab.waitForResponse(response => response.url().includes('https://disqus.com/embed/comments/'));
+		const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
 		const text = await response.text();
 		const dom = new JSDOM(text, response.url());
 		const script = dom.window.document.querySelector('#disqus-threadData')
@@ -67,7 +26,6 @@ module.exports.disqus = async (url) => {
 
 		return disqusThread(data);
 	} catch (e) {
-		console.error(e);
 		throw e;
 	} finally {
 		await tab.close();

+ 71 - 0
utils/extract-metadata.js

@@ -0,0 +1,71 @@
+const { JSDOM } = require("jsdom");
+
+module.exports.extractMetadata = extractMetadata;
+module.exports.cleanHtmlText = cleanHtmlText;
+module.exports.addMetadata = addMetadata;
+
+async function extractMetadata(tab, url) {
+	return await tab.evaluate((url) => {
+		const meta = {
+			author: "",
+			publisher: new URL(url).host,
+			authorType: "",
+		};
+
+		const ogProps = document.querySelector('meta[property="og:site_name"]');
+		const itemProps = document.querySelectorAll('[itemprop="author"]');
+		const ldJsonTags = document.querySelectorAll(
+			'script[type="application/ld+json"]'
+		);
+
+		if (ogProps) {
+			meta.author = ogProps && ogProps.content ? ogProps.content : "";
+			meta.authorType = "og";
+		}
+
+		Array.from(itemProps).forEach((element) => {
+			meta.publisher = meta.author;
+			meta.author = element.innerText;
+			meta.authorType = "ld";
+		});
+
+		Array.from(ldJsonTags).forEach((ldTag) => {
+			try {
+				const ld = JSON.parse(ldTag.innerHTML);
+				if (ld["@type"] === "Article") {
+					if (ld.author && ld.author["@type"] === "Person") {
+						meta.author = ld.author.name;
+						meta.authorType = "ld";
+					}
+					if (ld.publisher && ld.publisher["@type"] === "Organization") {
+						meta.publisher = ld.publisher.name;
+					}
+				}
+			} catch (e) { }
+		});
+		return meta;
+	}, url);
+};
+
+function cleanHtmlText(text) {
+	const s = new JSDOM("").window.document.createElement("span");
+	s.innerHTML = text;
+	return s.textContent;
+};
+
+function addMetadata(readable, authorType, author, publisher, url) {
+	if (authorType !== "ld") {
+		publisher = author;
+		author = readable.byline;
+	}
+
+	const authorName = cleanHtmlText(
+		[author, publisher].filter((s) => !!s && !!s.trim()).join(" • ")
+	);
+
+	readable.byline = authorName;
+	readable.author = author;
+	readable.publisher = publisher;
+	readable.url = url;
+	return readable;
+}

+ 80 - 83
utils/sites.js

@@ -1,99 +1,96 @@
 module.exports.blockedRegexes = {
-  "adweek.com": /.+\.lightboxcdn\.com\/.+/,
-  "afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
-  "businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
-  "chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
-  "economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
-  "editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
-  "foreignpolicy.com": /.+\.tinypass\.com\/.+/,
-  "fortune.com": /.+\.tinypass\.com\/.+/,
-  "haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
-  "haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
-  "inquirer.com": /.+\.tinypass\.com\/.+/,
-  "lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
-  "lrb.co.uk": /.+\.tinypass\.com\/.+/,
-  "nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
-  "medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
-  "interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
-  "repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
-  "spectator.co.uk": /.+\.tinypass\.com\/.+/,
-  "spectator.com.au": /.+\.tinypass\.com\/.+/,
-  "telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
-  "thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
-  "thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
-  "thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
-  "thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
-  "wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-  "historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
-  "barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-  "irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-  "elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
-  "sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
-  "latercera.com": /.+\.cxense\.com\/+/,
-  "lesechos.fr": /.+\.tinypass\.com\/.+/,
-  "washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
-  "thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
-  "technologyreview.com": /.+\.blueconic\.net\/.+/,
+	"adweek.com": /.+\.lightboxcdn\.com\/.+/, // Request blocklist: declutter aborts a request when its URL's host matches a key here and the URL matches that key's regex.
+	"afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
+	"businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
+	"chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
+	"economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
+	"editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
+	"foreignpolicy.com": /.+\.tinypass\.com\/.+/,
+	"fortune.com": /.+\.tinypass\.com\/.+/,
+	"haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
+	"haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
+	"inquirer.com": /.+\.tinypass\.com\/.+/,
+	"lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
+	"lrb.co.uk": /.+\.tinypass\.com\/.+/,
+	"nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
+	"medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
+	"interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
+	"repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
+	"spectator.co.uk": /.+\.tinypass\.com\/.+/,
+	"spectator.com.au": /.+\.tinypass\.com\/.+/,
+	"telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
+	"thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
+	"thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
+	"thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
+	"thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
+	"wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	"historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
+	"barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	"irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	"elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
+	"sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
+	"latercera.com": /.+\.cxense\.com\/+/,
+	"lesechos.fr": /.+\.tinypass\.com\/.+/,
+	"washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
+	"thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
+	"technologyreview.com": /.+\.blueconic\.net\/.+/,
 };
 
 module.exports.useGoogleBotSites = [
-  "adelaidenow.com.au",
-  "barrons.com",
-  "couriermail.com.au",
-  "dailytelegraph.com.au",
-  "fd.nl",
-  "genomeweb.com",
-  "haaretz.co.il",
-  "haaretz.com",
-  "heraldsun.com.au",
-  "mexiconewsdaily.com",
-  "ntnews.com.au",
-  "quora.com",
-  "seekingalpha.com",
-  "telegraph.co.uk",
-  "theaustralian.com.au",
-  "themarker.com",
-  "themercury.com.au",
-  "thenational.scot",
-  "thetimes.co.uk",
-  "wsj.com",
-  "kansascity.com",
-  "republic.ru",
-  "nzz.ch",
-  "handelsblatt.com",
-  "washingtonpost.com",
-  "df.cl",
+	"adelaidenow.com.au", // Hosts (subdomains included) for which getUserAgent returns the Googlebot user agent and X-Forwarded-For header.
+	"barrons.com",
+	"couriermail.com.au",
+	"dailytelegraph.com.au",
+	"fd.nl",
+	"genomeweb.com",
+	"haaretz.co.il",
+	"haaretz.com",
+	"heraldsun.com.au",
+	"mexiconewsdaily.com",
+	"ntnews.com.au",
+	"quora.com",
+	"seekingalpha.com",
+	"telegraph.co.uk",
+	"theaustralian.com.au",
+	"themarker.com",
+	"themercury.com.au",
+	"thenational.scot",
+	"thetimes.co.uk",
+	"wsj.com",
+	"kansascity.com",
+	"republic.ru",
+	"nzz.ch",
+	"handelsblatt.com",
+	"washingtonpost.com",
+	"df.cl",
 ];
 
 function matchDomain(domains, hostname) {
-  let matchedDomain = false;
-  if (!hostname) {
-    hostname = window.location.hostname;
-  }
-  if (typeof domains === "string") {
-    domains = [domains];
-  }
-  domains.some(
-    (domain) =>
-      (hostname === domain || hostname.endsWith("." + domain)) &&
-      (matchedDomain = domain)
-  );
-  return matchedDomain;
+	let matchedDomain = false;
+	if (typeof domains === "string") {
+		domains = [domains];
+	}
+	domains.some(
+		(domain) =>
+			(hostname === domain || hostname.endsWith("." + domain)) &&
+			(matchedDomain = domain)
+	);
+	return matchedDomain;
 }
 
 function matchUrlDomain(domains, url) {
-  return matchDomain(domains, urlHost(url));
+	return matchDomain(domains, urlHost(url)); // urlHost() reduces an http(s) URL to its hostname before matching
 }
 
 function urlHost(url) {
-  if (url && url.startsWith("http")) {
-    try {
-      return new URL(url).hostname;
-    } catch (e) {
-      console.log(`url not valid: ${url} error: ${e}`);
-    }
-  }
-  return url;
+	if (url && url.startsWith("http")) {
+		try {
+			return new URL(url).hostname;
+		} catch (e) {
+			console.log(`url not valid: ${url} error: ${e}`);
+		}
+	}
+	return url;
 }
 
 module.exports.matchDomain = matchDomain;

+ 18 - 0
utils/user-agent.js

@@ -0,0 +1,18 @@
+const { googleBot } = require('./constants');
+const { matchUrlDomain, useGoogleBotSites } = require("./sites");
+
+module.exports.getUserAgent = (url) => {
+	const useGoogleBot = useGoogleBotSites.some(function (item) {
+		return typeof item === "string" && matchUrlDomain(item, url);
+	});
+
+	if (!useGoogleBot) {
+		return {};
+	}
+	return {
+		userAgent: googleBot.userAgent,
+		headers: {
+			"X-Forwarded-For": googleBot.ip
+		}
+	}
+};