Browse Source

Switch to my own fork of bypass-paywalls.

Jason Schwarzenberger 9 months ago
parent
commit
9cb3b08464
5 changed files with 103 additions and 72 deletions
  1. scraper/headless/get-details.js (+1 -1)
  2. utils/constants.js (+11 -1)
  3. utils/sites.js (+66 -61)
  4. utils/user-agent.js (+24 -8)
  5. vendor/bypass-paywalls-chrome (+1 -1)

+ 1 - 1
scraper/headless/get-details.js

@@ -29,7 +29,7 @@ module.exports.getDetails = async (url) => {
 		await tab.addInitScript({ path: "scripts/cosmetic-filter.js" });
 		await tab.addInitScript({ path: "scripts/fix-relative-links.js" });
 		await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
-		await tab.waitForTimeout(5000);
+		await tab.waitForTimeout(3000);
 
 		const html = await tab.content();
 		const readable = await extractReadable(html, url);

+ 11 - 1
utils/constants.js

@@ -1,5 +1,6 @@
-const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
+const googleBotUserAgent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
 const googleBotIp = '66.249.66.1';
+const bingBotUserAgent = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)';
 
 module.exports.googleBot = {
 	userAgent: googleBotUserAgent,
@@ -8,4 +9,13 @@ module.exports.googleBot = {
 		'User-Agent': googleBotUserAgent,
 		'X-Forwarded-For': googleBotIp,
 	}
+}
+
+module.exports.bingBot = {
+	userAgent: bingBotUserAgent,
+	ip: undefined,
+	headers: {
+		'User-Agent': bingBotUserAgent,
+		'X-Forwarded-For': undefined,
+	}
 }

+ 66 - 61
utils/sites.js

@@ -1,68 +1,73 @@
 module.exports.blockedRegexes = {
-	"adweek.com": /.+\.lightboxcdn\.com\/.+/,
-	"afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
-	"businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
-	"chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
-	"economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
-	"editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
-	"foreignpolicy.com": /.+\.tinypass\.com\/.+/,
-	"fortune.com": /.+\.tinypass\.com\/.+/,
-	"haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
-	"haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
-	"inquirer.com": /.+\.tinypass\.com\/.+/,
-	"lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
-	"lrb.co.uk": /.+\.tinypass\.com\/.+/,
-	"nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
-	"medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
-	"interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
-	"repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
-	"spectator.co.uk": /.+\.tinypass\.com\/.+/,
-	"spectator.com.au": /.+\.tinypass\.com\/.+/,
-	"telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
-	"thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
-	"thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
-	"thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
-	"thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
-	"wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
-	"barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
-	"sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
-	"latercera.com": /.+\.cxense\.com\/+/,
-	"lesechos.fr": /.+\.tinypass\.com\/.+/,
-	"washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
-	"thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
-	"technologyreview.com": /.+\.blueconic\.net\/.+/,
+	'adweek.com': /.+\.lightboxcdn\.com\/.+/,
+	'afr.com': /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
+	'businessinsider.com': /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
+	'chicagotribune.com': /.+:\/\/.+\.tribdss\.com\//,
+	'economist.com': /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
+	'editorialedomani.it': /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
+	'foreignpolicy.com': /.+\.tinypass\.com\/.+/,
+	'fortune.com': /.+\.tinypass\.com\/.+/,
+	'haaretz.co.il': /haaretz\.co\.il\/htz\/js\/inter\.js/,
+	'haaretz.com': /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
+	'inquirer.com': /.+\.tinypass\.com\/.+/,
+	'lastampa.it': /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
+	'lrb.co.uk': /.+\.tinypass\.com\/.+/,
+	'medscape.com': /.+\.medscapestatic\.com\/.*medscape-library\.js/,
+	'interest.co.nz': /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
+	'repubblica.it': /scripts\.repubblica\.it\/pw\/pw\.js.+/,
+	'spectator.co.uk': /.+\.tinypass\.com\/.+/,
+	'spectator.com.au': /.+\.tinypass\.com\/.+/,
+	'telegraph.co.uk': /.+telegraph\.co\.uk.+martech.+/,
+	'thecourier.com.au': /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
+	'thenation.com': /thenation\.com\/.+\/paywall-script\.php/,
+	'thenational.scot': /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
+	'thewrap.com': /thewrap\.com\/.+\/wallkit\.js/,
+	'wsj.com': /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	'historyextra.com': /.+\.evolok\.net\/.+\/authorize\/.+/,
+	'barrons.com': /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	'irishtimes.com': /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
+	'elmercurio.com': /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
+	'sloanreview.mit.edu': /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
+	'latercera.com': /.+\.cxense\.com\/+/,
+	'lesechos.fr': /.+\.tinypass\.com\/.+/,
+	'washingtonpost.com': /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
+	'thehindu.com': /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
+	'technologyreview.com': /.+\.blueconic\.net\/.+/,
+	'spectator.us': /(cdn\.cxense\.com\/.+|\.tinypass\.com\/.+)/,
+	'nzherald.co.nz': /.+nzherald\.co\.nz\/(.+\/subs\/p\.min\.js.*|.+\/default\.js.*|.+\/react\.js.*)/,
 };
 
 module.exports.useGoogleBotSites = [
-	"adelaidenow.com.au",
-	"barrons.com",
-	"couriermail.com.au",
-	"dailytelegraph.com.au",
-	"fd.nl",
-	"genomeweb.com",
-	"haaretz.co.il",
-	"haaretz.com",
-	"heraldsun.com.au",
-	"mexiconewsdaily.com",
-	"ntnews.com.au",
-	"quora.com",
-	"seekingalpha.com",
-	"telegraph.co.uk",
-	"theaustralian.com.au",
-	"themarker.com",
-	"themercury.com.au",
-	"thenational.scot",
-	"thetimes.co.uk",
-	"wsj.com",
-	"kansascity.com",
-	"republic.ru",
-	"nzz.ch",
-	"handelsblatt.com",
-	"washingtonpost.com",
-	"df.cl",
+	'adelaidenow.com.au',
+	'barrons.com',
+	'couriermail.com.au',
+	'dailytelegraph.com.au',
+	'fd.nl',
+	'genomeweb.com',
+	'heraldsun.com.au',
+	'lavoixdunord.fr',
+	'mexiconewsdaily.com',
+	'ntnews.com.au',
+	'quora.com',
+	'seekingalpha.com',
+	'telegraph.co.uk',
+	'theaustralian.com.au',
+	'themercury.com.au',
+	'thenational.scot',
+	'thetimes.co.uk',
+	'wsj.com',
+	'kansascity.com',
+	'republic.ru',
+	'nzz.ch',
+	'handelsblatt.com',
+	'washingtonpost.com',
+	'df.cl'
+];
+
+module.exports.useBingBotSites = [
+	'haaretz.co.il',
+	'haaretz.com',
+	'themarker.com'
 ];
 
 function matchDomain(domains, hostname) {

+ 24 - 8
utils/user-agent.js

@@ -1,18 +1,34 @@
-const { googleBot } = require('./constants');
-const { matchUrlDomain, useGoogleBotSites } = require("./sites");
+const { googleBot, bingBot } = require('./constants');
+const { matchUrlDomain, useGoogleBotSites, useBingBotSites } = require("./sites");
 
 module.exports.getUserAgent = (url) => {
 	const useGoogleBot = useGoogleBotSites.some(function (item) {
 		return typeof item === "string" && matchUrlDomain(item, url);
 	});
+	const useBingBot = useBingBotSites.some(function (item) {
+		return typeof item === "string" && matchUrlDomain(item, url);
+	});
 
-	if (!useGoogleBot) {
-		return {};
+	if (useGoogleBot) {
+		return {
+			userAgent: googleBot.userAgent,
+			headers: {
+				"X-Forwarded-For": googleBot.ip
+			}
+		}
 	}
-	return {
-		userAgent: googleBot.userAgent,
-		headers: {
-			"X-Forwarded-For": googleBot.ip
+
+	if (useBingBot) {
+		return {
+			userAgent: bingBot.userAgent,
+			headers: {
+				"X-Forwarded-For": bingBot.ip
+			}
 		}
 	}
+
+	return {
+		userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+		headers: {}
+	};
 };

+ 1 - 1
vendor/bypass-paywalls-chrome

@@ -1 +1 @@
-Subproject commit 74b11cd8b6c7dc0a73cf09416fd95abe76adb3a8
+Subproject commit 42b901803543a656d9b584681dbf7fcdf13e5650