Browse Source

add scripts to simple scraper.

Jason Schwarzenberger 6 months ago
parent
commit
d50481b83c

+ 2 - 2
scraper/headless/get-details.js

@@ -26,8 +26,8 @@ module.exports.getDetails = async (url) => {
 		});
 
 		await tab.addInitScript({ path: "vendor/bypass-paywalls-chrome/src/js/contentScript.js" });
-		await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filter.js" });
-		await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
+		await tab.addInitScript({ path: "scripts/cosmetic-filter.js" });
+		await tab.addInitScript({ path: "scripts/fix-relative-links.js" });
 		await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
 		await tab.waitForTimeout(2000);
 

+ 37 - 1
scraper/simple/get-details.js

@@ -1,8 +1,30 @@
 const fetch = require('node-fetch');
+const { JSDOM } = require('jsdom');
+const { Script, createContext } = require("vm");
+const { readFile } = require('fs');
+
 
 const { getUserAgent } = require('../../utils/user-agent');
 const { extractReadable } = require('../../utils/extract-metadata');
 
+async function runScript(filename, context) {
+	try {
+		return await new Promise((resolve, reject) => {
+			readFile(filename, {}, (e, content) => {
+				if (e) {
+					reject(e);
+				}
+				const script = new Script(content, { filename: `(internal):${filename}` })
+				script.runInContext(context);
+				resolve(context);
+			});
+		})
+	} catch (e) {
+		console.error(e);
+	}
+	return context;
+}
+
 module.exports.getDetails = async (url) => {
 	try {
 		const { userAgent, headers } = getUserAgent(url);
@@ -15,7 +37,21 @@ module.exports.getDetails = async (url) => {
 		if (!response.ok) {
 			throw response.statusText;
 		}
-		const html = await response.text();
+
+		const { window } = new JSDOM(await response.text(), { url });
+		window.window = window;
+		window.setTimeout = cb => cb();
+		window.setInterval = cb => cb();
+		const context = createContext(window);
+
+		await runScript('vendor/bypass-paywalls-chrome/src/js/contentScript.js', context);
+		await runScript('scripts/cosmetic-filter.js', context);
+		await runScript('scripts/fix-relative-links.js', context);
+
+		const script = new Script(`window.dispatchEvent(new window.Event('DOMContentLoaded'));`);
+		script.runInContext(context);
+		const html = context.document.querySelector('html').innerHTML;
+
 		const readable = await extractReadable(html, url);
 		return readable;
 	} catch (e) {

+ 1 - 1
scraper/headless/scripts/cosmetic-filter.js

@@ -1,4 +1,4 @@
-window.addEventListener('DOMContentLoaded', function (event) {
+window.addEventListener('DOMContentLoaded', function () {
   removeHiddenElements();
 
   if (matchDomain("stuff.co.nz")) {

+ 1 - 1
scraper/headless/scripts/fix-relative-links.js

@@ -1,4 +1,4 @@
-window.addEventListener('DOMContentLoaded', function (event) {
+window.addEventListener('DOMContentLoaded', function () {
 	const { host, protocol } = window.location;
 	const url = `${protocol}//${host}`;
 	[