CSS selector filter (#73)

* Re #9 CSS Selector filtering, Adding test for #9
4 years ago · 2346b42ef2
parent 1a0c3f1250
commit 2346b42ef2
6 changed files with 144 additions and 7 deletions
--- a/backend/init.py
+++ b/backend/init.py
@@ -386,6 +386,17 @@ def changedetection_app(conig=None, datastore_o=None):
                if len(datastore.data['watching'][uuid]['history']):
                    update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)

+
+            # CSS Filter
+            css_filter = request.form.get('css_filter')
+            if css_filter:
+                datastore.data['watching'][uuid]['css_filter'] = css_filter.strip()
+
+                # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
+                if len(datastore.data['watching'][uuid]['history']):
+                    update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
+
+
            validators.url(url)  # @todo switch to prop/attr/observer
            datastore.data['watching'][uuid].update(update_obj)
            datastore.needs_write = True
@@ -876,7 +887,7 @@ def ticker_thread_check_time_launch_checks():
                if not uuid in running_uuids and uuid not in update_q.queue:
                    update_q.put(uuid)

-            time.sleep(1)
+            time.sleep(0.1)

        # Should be low so we can break this out in testing
        app.config.exit.wait(1)
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -66,25 +66,36 @@ class perform_site_check():
                             timeout=timeout,
                             verify=False)

-            stripped_text_from_html = get_text(r.text)
+            # CSS Filter
+            css_filter = self.datastore.data['watching'][uuid]['css_filter']
+            if css_filter and len(css_filter.strip()):
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(r.content, "html.parser")
+                stripped_text_from_html = ""
+                for item in soup.select(css_filter):
+                    text = str(item.get_text())+"\n"
+                    stripped_text_from_html += text
+
+            else:
+                stripped_text_from_html = get_text(r.text)

        # Usually from networkIO/requests level
        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
            update_obj["last_error"] = str(e)
-
            print(str(e))

        except requests.exceptions.MissingSchema:
            print("Skipping {} due to missing schema/bad url".format(uuid))

        # Usually from html2text level
-        except UnicodeDecodeError as e:
-
+        except Exception as e:
+            # Was `except UnicodeDecodeError as e:` -- widened so other exceptions are also stored in last_error
            update_obj["last_error"] = str(e)
            print(str(e))
            # figure out how to deal with this cleaner..
            # 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte

+
        else:
            # We rely on the actual text in the html output.. many sites have random script vars etc,
            # in the future we'll implement other mechanisms.
--- a/backend/store.py
+++ b/backend/store.py
@@ -61,7 +61,8 @@ class ChangeDetectionStore:
            'headers': {},  # Extra headers to send
            'history': {},  # Dict of timestamp and output stripped filename
            'ignore_text': [], # List of text to ignore when calculating the comparison checksum
-            'notification_urls': [] # List of URLs to add to the notification Queue (Usually AppRise)
+            'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
+            'css_filter': "",
        }

        if path.isfile('backend/source.txt'):
--- a/backend/templates/edit.html
+++ b/backend/templates/edit.html
@@ -24,7 +24,13 @@
                       size="5"/>
                <span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
            </div>
-
+            </br>
+            <div class="pure-control-group">
+                <label for="minutes">CSS Filter</label>
+                <input type="text" id="css_filter" name="css_filter" value="{{watch.css_filter}}"
+                       size="25"/>
+                <span class="pure-form-message-inline">Limit text to this CSS rule, all matching CSS is included.</span>
+            </div>
            <!-- @todo: move to tabs --->
            <fieldset class="pure-group">
                <label for="ignore-text">Ignore text</label>
--- a/backend/tests/test_css_selector.py
+++ b/backend/tests/test_css_selector.py
@@ -0,0 +1,102 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def set_original_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div id="sametext">Some text thats the same</div>
+     <div id="changetext">Some text that will change</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/output.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+def set_modified_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>which has this one new line</p>
+     </br>
+     So let's see what happens.  </br>
+     <div id="sametext">Some text thats the same</div>
+     <div id="changetext">Some text that changes</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/output.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_check_markup_css_filter_restriction(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    css_filter = "#sametext"
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and set the CSS filter
+    # (url, tag and headers are posted alongside it in the same form)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": ""},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert bytes(css_filter.encode('utf-8')) in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+    #  Make a change
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should have 'unviewed' still
+    # Because it should be looking at only that 'sametext' id
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,10 @@ feedgen ~= 0.9
 flask-login ~= 0.5
 pytz
 urllib3
+
+# Notification library
 apprise ~= 0.9
+
+# Used for CSS filtering
+bs4
+