Refactor of extract regex

2 years ago · ced1c66e4d
parent 291700554e
commit ced1c66e4d
3 changed files with 18 additions and 6 deletions
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -215,15 +215,25 @@ class perform_site_check():
        if len(extract_text) > 0:
            regex_matched_output = []
            for s_re in extract_text:
-                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
-                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
+                result = re.findall(s_re.encode('utf8'), stripped_text_from_html, flags=re.DOTALL)
                if result:
-                    regex_matched_output = regex_matched_output + result
-
+                    for l in result:
+                        if type(l) is tuple:
+                            #@todo - some formatter option default (between groups)
+                            regex_matched_output += list(l) + [b'\n']
+                        else:
+                            # @todo - some formatter option default (between each ungrouped result)
+                            regex_matched_output += [l] + [b'\n']
+
+            # Now we will only show what the regex matched
+            stripped_text_from_html = b''
+            text_content_before_ignored_filter = b''
            if regex_matched_output:
-                stripped_text_from_html = b'\n'.join(regex_matched_output)
+                # @todo some formatter for presentation?
+                stripped_text_from_html = b''.join(regex_matched_output)
                text_content_before_ignored_filter = stripped_text_from_html

+
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -64,6 +64,7 @@ def test_check_filter_and_regex_extract(client, live_server):
    )
    assert b"1 Imported" in res.data

+    time.sleep(1)
    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

@@ -86,6 +87,8 @@ def test_check_filter_and_regex_extract(client, live_server):

    assert b"Updated watch." in res.data

+    time.sleep(2)
+
    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -113,7 +113,6 @@ class update_worker(threading.Thread):
                        err_text = "Page request from server didnt respond correctly"
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                           'last_check_status': e.status_code})
-
                    except Exception as e:
                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})