New feature - "Extract text" filter ability (#624)

3 years ago · 4101ae00c6
parent 62f14df3cb
commit 4101ae00c6
6 changed files with 160 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,5 +8,6 @@ __pycache__
 build
 dist
 venv
+test-datastore
 *.egg-info*
 .vscode/settings.json
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -204,6 +204,20 @@ class perform_site_check():
        else:
            stripped_text_from_html = stripped_text_from_html.encode('utf8')

+        # 615 Extract text by regex
+        extract_text = watch.get('extract_text', [])
+        if len(extract_text) > 0:
+            regex_matched_output = []
+            for s_re in extract_text:
+                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
+                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
+                if result:
+                    regex_matched_output.append(result[0])
+
+            if regex_matched_output:
+                stripped_text_from_html = b'\n'.join(regex_matched_output)
+                text_content_before_ignored_filter = stripped_text_from_html
+
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
@ -221,6 +235,7 @@ class perform_site_check():
            # Yeah, lets block first until something matches
            blocked_by_not_found_trigger_text = True
            # Filter and trigger works the same, so reuse it
+            # It should return the line numbers that match
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
                                                  wordlist=watch['trigger_text'],
                                                  mode="line numbers")
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -223,7 +223,7 @@ class validateURL(object):
        except validators.ValidationFailure:
            message = field.gettext('\'%s\' is not a valid URL.' % (field.data.strip()))
            raise ValidationError(message)
-        
+
 class ValidateListRegex(object):
    """
    Validates that anything that looks like a regex passes as a regex
@ -330,6 +330,9 @@ class watchForm(commonSettingsForm):
    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')

    subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
+
+    extract_text = StringListField('Extract text', [ValidateListRegex()])
+
    title = StringField('Title', default='')

    ignore_text = StringListField('Ignore text', [ValidateListRegex()])
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -35,7 +35,8 @@ class model(dict):
            'notification_title': default_notification_title,
            'notification_body': default_notification_body,
            'notification_format': default_notification_format,
-            'css_filter': "",
+            'css_filter': '',
+            'extract_text': [],  # Extract text by regex after filters
            'subtractive_selectors': [],
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -199,6 +199,17 @@ nav
                        </span>
                    </div>
                </fieldset>
+                <fieldset>
+                    <div class="pure-control-group">
+                        {{ render_field(form.extract_text, rows=5, placeholder="/some.regex\d{2}/ case-insensitive regex") }}
+                        <span class="pure-form-message-inline">
+                    <ul>
+                        <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
+                        <li>One line per regular-expression.</li>
+                    </ul>
+                        </span>
+                    </div>
+                </fieldset>
            </div>

            <div class="tab-pane-inner visual-selector-ui" id="visualselector">
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@ -0,0 +1,127 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from .util import live_server_setup
+
+from ..html_tools import *
+
+
+def set_original_response():
+    """Write the baseline HTML fixture to test-datastore/endpoint-content.txt."""
+    baseline_html = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div id="sametext">Some text thats the same</div>
+     <div id="changetext">Some text that will change</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
+        endpoint_file.write(baseline_html)
+
+
+def set_modified_response():
+    """Overwrite the endpoint fixture with the changed version of the page."""
+    # The "changetext" div now contains the "<n> online" / "<n> guests" text.
+    changed_html = """<html>
+       <body>
+     Some initial text</br>
+     <p>which has this one new line</p>
+     </br>
+     So let's see what happens.  </br>
+     <div id="sametext">Some text thats the same</div>
+     <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests)</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
+        endpoint_file.write(changed_html)
+
+
+def test_check_filter_and_regex_extract(client, live_server):
+    """Check 'extract_text' regex extraction applied after a CSS filter."""
+    sleep_time_for_fetch_thread = 3
+
+    live_server_setup(live_server)
+    css_filter = "#changetext"
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and set the CSS filter plus two extract_text regexes,
+    # one per line. Backslashes are doubled so "\d" is not an invalid string
+    # escape, while "\n" remains a real newline between the expressions.
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": css_filter,
+              'extract_text': '\\d+ online\n\\d+ guests',
+              "url": test_url,
+              "tag": "",
+              "headers": "",
+              'fetch_backend': "html_requests"
+              },
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert rb'\d+ online' in res.data
+
+    # No explicit re-check is triggered here (that call was disabled); just
+    # wait one fetch interval before changing the served content.
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Make a change
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # The watch should be flagged 'unviewed' because the text inside the
+    # filtered '#changetext' element changed
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    # Check that the preview shows the extracted text
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    # Class will be blank for now because the frontend didn't apply the diff
+    assert b'<div class="">1000 online' in res.data
+
+    # Both regexes should have matched
+    assert b'<div class="">80 guests' in res.data
+
+    # The rest of the filtered element's text should have been dropped
+    assert b'Some text that did change' not in res.data