Jason Schwarzenberger 11 months ago
commit
63a47ce895
9 changed files with 1309 additions and 0 deletions
  1. 4 0
      .gitignore
  2. 46 0
      index.js
  3. 1080 0
      package-lock.json
  4. 17 0
      package.json
  5. 55 0
      utils/bypass.js
  6. 27 0
      utils/dom-node.js
  7. 22 0
      utils/ld.js
  8. 22 0
      utils/loop-parse.js
  9. 36 0
      utils/telegraph.js

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+# Ignore Mac system files
+node_modules
+.DS_Store
+*.zip

+ 46 - 0
index.js

@@ -0,0 +1,46 @@
+const express = require('express');
+const bodyParser = require('body-parser');
+
+const bypass = require('./utils/bypass');
+
+// Port to listen on: the NODE_PORT environment variable, falling back to 3000.
+const port = process.env.NODE_PORT || 3000;
+const app = express();
+
+// Body parsers are registered before the routes so req.body is populated for POST /.
+app.use(bodyParser.json()); // for parsing application/json
+app.use(bodyParser.urlencoded({ extended: true })); // for parsing application/x-www-form-urlencoded
+
+app.post('/', async (req, res, next) => {
+  const url = await bypass(req.body.url);
+  if (req.body.redirect) {
+    res.redirect(url);
+    return next();
+  }
+  res.send(url);
+  return next();
+});
+
+app.get('/', async (req, res) => {
+  res.send(`<html><body><pre>
+
+  <form action="/" method="POST" accept-charset="UTF-8">
+	  <input name="url" pattern="https:\/\/(www\.)?nzherald\.co\.nz/.*" />
+	  <button type="submit">Get Link</button>
+	  <button value="true" name="redirect" type="submit">Redirect</button>
+	</form>
+</pre></body></html>`);
+});
+
+app.get('*', async (req, res, next) => {
+  const queryString = Object.keys(req.query)
+    .map(k => `${k}=${req.query[k]}`)
+    .join('&');
+
+  const path = req.path.substring(1);
+  const host = !/^https?:\/\/(www\.)?nzherald.co.nz/.test(path) ? 'https://www.nzherald.co.nz' : '';
+
+  const url = await bypass(host + path + '?' + queryString);
+  res.redirect(url);
+  return next();
+});
+
+app.listen(port, () => console.log(`Example app listening on port ${port}!`));

File diff suppressed because it is too large
+ 1080 - 0
package-lock.json


+ 17 - 0
package.json

@@ -0,0 +1,17 @@
+{
+  "name": "nzhp",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "body-parser": "^1.19.0",
+    "express": "^4.17.1",
+    "jsdom": "^15.1.1",
+    "node-fetch": "^2.6.0"
+  }
+}

+ 55 - 0
utils/bypass.js

@@ -0,0 +1,55 @@
+const { JSDOM } = require('jsdom');
+
+const { parse } = require('./ld');
+const telegraph = require('./telegraph');
+const domToNode = require('./dom-node');
+
+const findArticleSchema = document => {
+  return Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
+    .map(e => parse(e.textContent))
+    .find(l => {
+      if (!l) {
+        return false;
+      }
+      const isSchema = l['@context'] === 'http://schema.org';
+      const isArticle = l['@type'] === 'Article';
+      return isSchema && isArticle;
+    });
+};
+
+const cleanHtmlText = text => {
+  const s = new JSDOM('').window.document.createElement('span');
+  s.innerHTML = text;
+  return s.textContent;
+};
+
+const buildContent = (document, url) => {
+  const node = document.querySelector('.full-content,.premium-content').cloneNode({ deep: true });
+  const bad = node.querySelectorAll('meta,script,style,.related-header,.related-articles-container,.ad-container');
+  [].forEach.call(bad, b => b.remove());
+
+  const source = document.createElement('p');
+  source.innerHTML = `Original article: <a href="${url}"">${url}</a>.`;
+  node.prepend(source);
+
+  return domToNode(node).children.filter(m => !m.trim || m.trim().length > 0);
+};
+
+module.exports = async url => {
+  const DOM = await JSDOM.fromURL(url);
+  const document = DOM.window.document;
+
+  const ld = findArticleSchema(document);
+
+  const content = buildContent(document, url);
+  const title = cleanHtmlText(ld.headline);
+
+  const account = await telegraph.createAccount({
+    author_name: ld.author.name,
+    author_url: url,
+    short_name: ld.author.name
+  });
+  const page = await telegraph.createPage(title, content, account);
+
+  return page.url;
+};

+ 27 - 0
utils/dom-node.js

@@ -0,0 +1,27 @@
+module.exports = function domToNode(domNode) {
+  if (domNode.nodeType == domNode.TEXT_NODE) {
+    return domNode.data;
+  }
+  if (domNode.nodeType != domNode.ELEMENT_NODE) {
+    return false;
+  }
+  var nodeElement = {};
+  nodeElement.tag = domNode.tagName.toLowerCase();
+  for (var i = 0; i < domNode.attributes.length; i++) {
+    var attr = domNode.attributes[i];
+    if (attr.name == 'href' || attr.name == 'src') {
+      if (!nodeElement.attrs) {
+        nodeElement.attrs = {};
+      }
+      nodeElement.attrs[attr.name] = attr.value;
+    }
+  }
+  if (domNode.childNodes.length > 0) {
+    nodeElement.children = [];
+    for (var i = 0; i < domNode.childNodes.length; i++) {
+      var child = domNode.childNodes[i];
+      nodeElement.children.push(domToNode(child));
+    }
+  }
+  return nodeElement;
+};

+ 22 - 0
utils/ld.js

@@ -0,0 +1,22 @@
+const { parseJson } = require('./loop-parse');
+
+const parse = text => {
+  const input = text.replace(/[\r\n]/g, '');
+  const schemaInput = input.replace(/[\ ]/g, '');
+  const schema = parseJson(schemaInput);
+  const json = parseJson(input);
+  if (!json) {
+    return schema;
+  }
+  if (!schema) {
+    return json;
+  }
+  return {
+    ...json,
+    '@context': schema['@context'],
+    '@type': schema['@type']
+  };
+};
+
+// Public API: `parse` handles one JSON-LD string; `parseArray` maps `parse` over an array of them.
+module.exports.parse = parse;
+module.exports.parseArray = textArray => textArray.map(parse);

+ 22 - 0
utils/loop-parse.js

@@ -0,0 +1,22 @@
+const loopParse = (json, iterations) => {
+  try {
+    return JSON.parse(json);
+  } catch (e) {
+    let column = 0;
+    let m = e.message.match(/position (\d+) ?/);
+    if (m) {
+      column = Number(m[1]);
+    } else {
+      m = e.message.match(/column (\d+) ?/);
+      column = Number(m[1]) - 1;
+    }
+    const left = json.substring(0, column).replace(/,$/, '');
+    const right = json.substring(column + 1);
+    if (iterations <= 0) {
+      return null;
+    }
+    return loopParse(left + right, iterations - 1);
+  }
+};
+
+// Lenient parse entry point: the retry budget passed to loopParse is the input length plus one.
+module.exports.parseJson = text => loopParse(text, text.length + 1);

+ 36 - 0
utils/telegraph.js

@@ -0,0 +1,36 @@
+const fetch = require('node-fetch');
+
+const TELEGRAPH_URL = 'https://api.telegra.ph/';
+
+const post = async (url, body) => {
+  return await fetch(url, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify(body)
+  });
+};
+
+module.exports.createAccount = async ({ author_name, author_url, short_name }) => {
+  const request = await post(`${TELEGRAPH_URL}/createAccount`, {
+    author_name,
+    author_url,
+    short_name: short_name.substring(0, 31)
+  });
+  const body = await request.json();
+  return body.result;
+};
+
+module.exports.createPage = async (title, content, { author_name, author_url, access_token }) => {
+  const request = await post(`${TELEGRAPH_URL}/createPage`, {
+    access_token,
+    author_name,
+    author_url,
+    content,
+    title,
+    return_content: false
+  });
+  const body = await request.json();
+  return body.result;
+};