From 98f6f4619f094e3af30f0987cd662a9c7d830f00 Mon Sep 17 00:00:00 2001
From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com>
Date: Sat, 30 Jan 2021 10:14:19 +0100
Subject: [PATCH] Switch to inscriptis, prepare config backend struct

---
 backend/fetch_site_status.py    | 57 ++++++++++++++++++---------------
 backend/store.py                | 18 +++++++++--
 backend/templates/settings.html | 54 +++++++++++++++++++++++++++++++
 requirements.txt                |  3 +-
 4 files changed, 103 insertions(+), 29 deletions(-)
 create mode 100644 backend/templates/settings.html

diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index fc6b2462..cb43c2e5 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -6,7 +6,7 @@ import os
 import re
 import html2text
 from urlextract import URLExtract
-
+from inscriptis import get_text

 # Hmm Polymorphism datastore, thread, etc
 class perform_site_check(Thread):
@@ -36,7 +36,6 @@ class perform_site_check(Thread):
             f.write(output)
             f.close()

-
     def save_response_stripped_output(self, output):
         fname = "{}/{}.stripped.txt".format(self.output_path, self.timestamp)
         with open(fname, 'w') as f:
@@ -47,49 +46,56 @@ class perform_site_check(Thread):

     def run(self):

-        # Default headers
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,cs;q=0.7'
-        }
-
         extra_headers = self.datastore.get_val(self.uuid, 'headers')
-        headers.update(extra_headers)
+
+        # Tweak the base config with the per-watch ones
+        request_headers = self.datastore.data['settings']['headers'].copy()
+        request_headers.update(extra_headers)

         print("Checking", self.url)
+        print(request_headers)

         self.ensure_output_path()

         try:
-            r = requests.get(self.url, headers=headers, timeout=15, verify=False)
-            stripped_text_from_html = html2text.html2text(r.text)
+            timeout = self.datastore.data['settings']['requests']['timeout']
+        except KeyError:
+            # @todo yeah this should go back to the default value in store.py, but this whole object should abstract off it
+            timeout = 15
+
+        try:
+            r = requests.get(self.url,
+                             headers=request_headers,
+                             timeout=timeout,
+                             verify=False)
+
+            stripped_text_from_html = get_text(r.text)

             # @todo This should be a config option.
             # Many websites include junk in the links, trackers, etc.. Since we are really a service all about text changes..
-            extractor = URLExtract()
-            urls = extractor.find_urls(stripped_text_from_html)
+# inscriptis handles this much cleaner, probably not needed..
+#            extractor = URLExtract()
+#            urls = extractor.find_urls(stripped_text_from_html)

             # Remove the urls, longest first so that we dont end up chewing up bigger links with parts of smaller ones.
-            if urls:
-                urls.sort(key=len, reverse=True)
-
-                for url in urls:
-                    # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
-                    if "://" in url:
-                        #print ("Stripping link", url)
-                        stripped_text_from_html = stripped_text_from_html.replace(url, '')
+#            if urls:
+#                urls.sort(key=len, reverse=True)
+#                for url in urls:
+#                    # Sometimes URLExtract will consider something like 'foobar.com' as a link when that was just text.
+#                    if "://" in url:
+#                        # print ("Stripping link", url)
+#                        stripped_text_from_html = stripped_text_from_html.replace(url, '')

         # Usually from networkIO/requests level
-        except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e:
+        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
             self.datastore.update_watch(self.uuid, 'last_error', str(e))
             print(str(e))

         except requests.exceptions.MissingSchema:
-            print ("Skipping {} due to missing schema/bad url".format(self.uuid))
+            print("Skipping {} due to missing schema/bad url".format(self.uuid))

         # Usually from html2text level
         except UnicodeDecodeError as e:
@@ -123,6 +129,5 @@ class perform_site_check(Thread):
             history.update(dict([(self.timestamp, output_filepath)]))
             self.datastore.update_watch(self.uuid, 'history', history)

-
         self.datastore.update_watch(self.uuid, 'last_checked', int(time.time()))
         pass

diff --git a/backend/store.py b/backend/store.py
index 4f509de2..4130a66a 100644
--- a/backend/store.py
+++ b/backend/store.py
@@ -10,7 +10,19 @@ class ChangeDetectionStore:

     def __init__(self):
         self.data = {
-            'watching': {}
+            'watching': {},
+            'settings': {
+                'headers': {
+                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Accept-Language': 'en-GB,en-US;q=0.9,en;'
+                },
+                'requests': {
+                    'timeout': 15,  # Default 15 seconds
+                    'max_seconds_from_last_check': 3 * 60 * 60  # Default 3 hours
+                }
+            }
         }

@@ -26,10 +38,12 @@ class ChangeDetectionStore:
             'history' : {}  # Dict of timestamp and output stripped filename
         }

+
         try:
             with open('/datastore/url-watches.json') as json_file:
+                from_disk = json.load(json_file)
-                self.data.update(json.load(json_file))
+                self.data.update(from_disk)

                 # Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future.
                 # @todo pretty sure theres a python we todo this with an abstracted(?) object!

diff --git a/backend/templates/settings.html b/backend/templates/settings.html
new file mode 100644
index 00000000..54ebaeb4
--- /dev/null
+++ b/backend/templates/settings.html
@@ -0,0 +1,54 @@
+{% extends 'base.html' %}
+
+{% block content %}
+
+<div class="edit-form">
+    <form class="pure-form pure-form-stacked" action="/settings" method="POST">
+        <fieldset>
+            <div class="pure-control-group">
+                <label for="minutes">Minutes between recheck</label>
+                <input type="text" id="minutes" required="" name="minutes" value="{{minutes}}" size="5"/>
+                <span class="pure-form-message-inline">This is a required field.</span>
+            </div>
+            <div class="pure-control-group">
+                <button type="submit" class="pure-button pure-button-primary">Save</button>
+            </div>
+            <div class="pure-control-group">
+                <a href="/" class="pure-button button-small button-cancel">Cancel</a>
+                <a href="#" class="pure-button button-small button-error">Delete</a>
+            </div>
+        </fieldset>
+    </form>
+</div>
+
+{% endblock %}

diff --git a/requirements.txt b/requirements.txt
index a3b0f3a0..f0121d9f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ bleach==3.2.1
 html5lib==0.9999999  # via bleach
 timeago
 html2text
-urlextract
+inscriptis
+
 # @notes
 # - Dont install socketio, it interferes with flask_socketio
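
Reviewer note: the URLExtract stripping pass is commented out above on the assumption that inscriptis already keeps raw link targets out of the rendered text. A minimal standalone sketch of the difference between the two renderers (not part of the patch; the sample HTML and printed results are illustrative):

    # Requires: pip install html2text inscriptis
    import html2text
    from inscriptis import get_text

    html = '<h1>Price</h1><p>Now <a href="https://example.com/x?tracker=123">$10</a></p>'

    # Old path: html2text emits markdown-style output, so the raw URL
    # (often carrying volatile tracker parameters) lands in the watched text.
    print(html2text.html2text(html))  # roughly: "# Price\n\nNow [$10](https://example.com/x?tracker=123)"

    # New path: inscriptis renders the page like a text browser and keeps
    # only the anchor text, so churning tracker URLs no longer show up as changes.
    print(get_text(html))  # roughly: "Price\n\nNow $10"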
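
Reviewer note: request headers are now assembled copy-then-update, so the global defaults stored in the datastore survive across watches and the per-watch extras win on any key collision. A sketch with made-up values:

    # .copy() first: calling update() on the shared settings dict would
    # leak one watch's headers into every later check.
    global_headers = {'User-Agent': 'Mozilla/5.0 ...', 'Accept-Language': 'en-GB'}
    extra_headers = {'Accept-Language': 'de-DE', 'Cookie': 'session=abc'}  # hypothetical per-watch headers

    request_headers = global_headers.copy()
    request_headers.update(extra_headers)

    assert request_headers['Accept-Language'] == 'de-DE'  # per-watch value wins
    assert 'User-Agent' in request_headers                # global default kept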
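
Reviewer note: the try/except KeyError around the timeout lookup matters because store.py merges the on-disk JSON with a top-level dict.update(), which replaces the 'settings' subtree wholesale; a url-watches.json carrying a 'settings' key without the 'requests' block would otherwise crash every check. A sketch of that failure mode (the file contents are hypothetical):

    defaults = {'watching': {}, 'settings': {'requests': {'timeout': 15}}}

    # Saved by a build that had 'settings' but no 'requests' inside it.
    from_disk = {'watching': {}, 'settings': {'headers': {}}}
    defaults.update(from_disk)  # top-level keys are replaced wholesale

    try:
        timeout = defaults['settings']['requests']['timeout']
    except KeyError:
        timeout = 15  # fall back to the default, as the @todo in the patch notes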