Browse Source

Capture NZ Herald header picture/video.

Jason Schwarzenberger 11 months ago
parent
commit
dd7b7980fc
2 changed files with 34 additions and 5 deletions
  1. 31 3
      utils/config/sites.js
  2. 3 2
      utils/declutter.js

+ 31 - 3
utils/config/sites.js

@@ -54,14 +54,26 @@ module.exports = [
     premium: ' Premium',
     selectors: {
       authorName: '.author-title,.author span',
-      content: '.full-content',
+      content: '.article-main',
       title: 'h1',
       publisher: '.syndicator-name',
       premium: '.premium-content',
-      bad: '.related-header,.related-articles-container,.ad-container,.video-js :not(video)'
+      bad: [
+        '.related-header',
+        'header .text-wrapper',
+        '.pb-f-global-recommend',
+        '.pb-f-utilities-sharebar',
+        '.pb-f-article-related-articles',
+        '.article-offer',
+        '.bio-with-share',
+        '.premium-content',
+        '.related-articles-container',
+        '.ad-container',
+        '.video-js :not(video)'
+      ].join(',')
     },
     timeout: 60000,
-    waitUntil: 'networkidle0'
+    waitUntil: 'domcontentloaded'
   },
   {
     userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
@@ -158,5 +170,21 @@ module.exports = [
     },
     timeout: 30000,
     waitUntil: 'domcontentloaded'
+  },
+  {
+    userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+    host: /(www\.)?interest\.co\.nz/,
+    publisher: 'Interest.co.nz',
+    premium: '',
+    selectors: {
+      authorName: 'article .by-line .author-name',
+      content: 'article #content',
+      title: 'article .views-field-field-short-headline, article .story-long-headline',
+      publisher: 'article .posted-in',
+      premium: '',
+      bad: '#related-tags'
+    },
+    timeout: 30000,
+    waitUntil: 'domcontentloaded'
   }
 ];

+ 3 - 2
utils/declutter.js

@@ -141,7 +141,7 @@ const getExtensions = async () => {
 
 const UA = site => {
   if (!site || !site.userAgent) {
-    return 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36';
+    return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36';
   }
   return site.userAgent;
 };
@@ -161,7 +161,7 @@ module.exports = async url => {
     await tab.setUserAgent(UA(site));
     await tab.goto(url, {
       timeout: site ? site.timeout : 60000,
-      waitUntil: site ? site.waitUntil : 'networkidle0'
+      waitUntil: site ? site.waitUntil : 'domcontentloaded'
     });
     await fixRelativeLinks(tab, url);
     await bypass(tab, url);
@@ -211,6 +211,7 @@ module.exports = async url => {
 
     return page.url;
   } catch (e) {
+    console.error(e);
     await tab.close();
     await browser.close();
     throw e;