Browse Source

video support.

Jason Schwarzenberger 1 year ago
parent
commit
acdb3e6895
4 changed files with 1515 additions and 566 deletions
  1. 914 370
      package-lock.json
  2. 4 2
      package.json
  3. 65 12
      utils/extract-metadata.js
  4. 532 182
      yarn.lock

File diff suppressed because it is too large
+ 914 - 370
package-lock.json


+ 4 - 2
package.json

@@ -11,6 +11,7 @@
   "dependencies": {
     "@mozilla/readability": "^0.3.0",
     "body-parser": "^1.19.0",
+    "dompurify": "^2.2.3",
     "express": "^4.17.1",
     "jsdom": "^15.1.1",
     "metascraper": "^5.15.0",
@@ -19,6 +20,7 @@
     "metascraper-description": "^5.15.0",
     "metascraper-image": "^5.15.0",
     "metascraper-logo": "^5.15.0",
+    "metascraper-media-provider": "^5.15.2",
     "metascraper-publisher": "^5.15.0",
     "metascraper-readability": "^5.15.0",
     "metascraper-title": "^5.15.0",
@@ -27,7 +29,7 @@
     "node-fetch": "^2.6.1",
     "playwright": "^1.6.2",
     "sanitize-html": "^2.2.0",
-    "showdown": "^1.9.1",
-    "turndown": "^7.0.0"
+    "turndown": "^7.0.0",
+    "youtube-dl": "~3.0.2"
   }
 }

+ 65 - 12
utils/extract-metadata.js

@@ -9,50 +9,103 @@ const metascraper = require('metascraper')([
 	require('metascraper-publisher')(),
 	require('metascraper-title')(),
 	require('metascraper-url')(),
-	require('metascraper-readability')()
+	require('metascraper-readability')(),
+	require('metascraper-media-provider')()
 ]);
+const youtubedl = require('youtube-dl');
 const Turndown = require('turndown');
-const { Converter } = require('showdown');
-// const { parse } = require("@postlight/mercury-parser");
+const createDOMPurify = require('dompurify');
 
 module.exports.extractReadable = extractReadable;
 module.exports.extractMetadata = extractMetadata;
 
 async function extractReadable(html, url) {
-	const doc = new JSDOM(html, { url });
-	const reader = new Readability(doc.window.document);
+	const metadata = await extractMetadata(html, url);
+	const { window } = new JSDOM(html, { url });
+
+	if (metadata.videos) {
+		replaceVideos(window, metadata.videos);
+	}
+
+	const reader = new Readability(window.document);
 	let readable = reader.parse();
 
 	readable.textContent = undefined;
 	readable.dir = undefined;
 	readable.url = url;
-	readable.meta = await extractMetadata(html, url);
+	readable.meta = metadata;
 
 	readable.author = readable.meta.author;
 	readable.publisher = readable.meta.publisher;
 
-	const byline = doc.window.document.createElement('span');
+	const byline = window.document.createElement('span');
 	byline.innerHTML = [readable.meta.author, readable.meta.publisher].filter((s) => !!s && !!s.trim()).join(" • ");
 	readable.byline = byline.textContent;
 
+	const DOMPurify = createDOMPurify(new JSDOM('').window);
+
 	readable.html = readable.content;
+	readable.content = DOMPurify.sanitize(readable.html);
 	readable.markdown = new Turndown().turndown(readable.content);
-	readable.content = new Converter().makeHtml(readable.markdown);
-
 
 	return readable;
 }
 
 async function extractMetadata(html, url) {
-	const dom = new JSDOM(html, { url });
-	const document = dom.window.document;
+	const { window } = new JSDOM(html, { url });
 
 	const metadata = await metascraper({ html, url });
-	metadata.og = extractOpengraph(document);
+	metadata.og = extractOpengraph(window.document);
+	try {
+		console.log('doing a youtube-dl');
+		metadata.videos = await extractVideos(url);
+	} catch (e) {
+		console.error(e);
+	}
 
 	return metadata;
 };
 
+async function extractVideos(url) {
+	return new Promise((resolve, reject) => {
+		const args = ['-j'];
+		const opts = {};
+		youtubedl.getInfo(url, args, opts, (e, o) => {
+			if (e) {
+				reject(e);
+			}
+			resolve(o instanceof Array ? o : [o]);
+		});
+	});
+}
+
+function replaceVideos(window, videos) {
+	if (!videos) {
+		return;
+	}
+	videos.forEach(info => {
+		const id = info.id.replace(/(^ref\:)/, '');
+		console.log(info.info, id);
+		const $original = window.document.querySelector(`[data-video-id="${info.id}"],[data-video-id="${id}"],[id*="${info.id}"],[id*="${id}"]`)
+		if (!$original) {
+			console.log('no video', info.info, id);
+			return;
+		}
+		const $parent = $original.parentNode;
+		const $fixed = window.document.createElement('figure');
+		const $video = window.document.createElement('video');
+		const $caption = window.document.createElement('figcaption');
+		$video.src = info.url;
+		$video.setAttribute('controls', 'controls');
+		$video.setAttribute('title', info.title);
+		$caption.textContent = info.description;
+		$fixed.appendChild($video);
+		$fixed.appendChild($caption);
+		$parent.insertBefore($fixed, $original);
+		$original.remove();
+	});
+}
+
 function extractOpengraph(document) {
 	const props = document.querySelectorAll('meta[property^="og:"]');
 	return Array.from(props).reduce((meta, prop) => ({

File diff suppressed because it is too large
+ 532 - 182
yarn.lock