diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 23ea2dce..7a453abd 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -601,6 +601,16 @@ def changedetection_app(config=None, datastore_o=None):
if datastore.proxy_list is not None and form.data['proxy'] == '':
extra_update_obj['proxy'] = None
+ # Unchecking all of the filter_text options should reset them to the default (all enabled)
+ # This mainly matters when running the tests
+ if 'filter_text_added' in form.data and not form.data.get('filter_text_added') \
+ and 'filter_text_replaced' in form.data and not form.data.get('filter_text_replaced') \
+ and 'filter_text_removed' in form.data and not form.data.get('filter_text_removed'):
+ extra_update_obj['filter_text_added'] = True
+ extra_update_obj['filter_text_replaced'] = True
+ extra_update_obj['filter_text_removed'] = True
+
+
datastore.data['watching'][uuid].update(form.data)
datastore.data['watching'][uuid].update(extra_update_obj)
diff --git a/changedetectionio/diff.py b/changedetectionio/diff.py
index 2b566ffc..c3d8b0cd 100644
--- a/changedetectionio/diff.py
+++ b/changedetectionio/diff.py
@@ -10,7 +10,7 @@ def same_slicer(l, a, b):
return l[a:b]
# like .compare but a little different output
-def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True):
+def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True, include_replaced=True, include_change_type_prefix=True):
cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \\t", a=before, b=after)
# @todo Line-by-line mode instead of buncghed, including `after` that is not in `before` (maybe unset?)
@@ -19,19 +19,23 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr
g = before[alo:ahi]
yield g
elif include_removed and tag == 'delete':
- g = ["(removed) " + i for i in same_slicer(before, alo, ahi)]
+ row_prefix = "(removed) " if include_change_type_prefix else ''
+ g = [ row_prefix + i for i in same_slicer(before, alo, ahi)]
yield g
- elif tag == 'replace':
- g = ["(changed) " + i for i in same_slicer(before, alo, ahi)]
- g += ["(into) " + i for i in same_slicer(after, blo, bhi)]
+ elif include_replaced and tag == 'replace':
+ row_prefix = "(changed) " if include_change_type_prefix else ''
+ g = [row_prefix + i for i in same_slicer(before, alo, ahi)]
+ row_prefix = "(into) " if include_change_type_prefix else ''
+ g += [row_prefix + i for i in same_slicer(after, blo, bhi)]
yield g
elif include_added and tag == 'insert':
- g = ["(added) " + i for i in same_slicer(after, blo, bhi)]
+ row_prefix = "(added) " if include_change_type_prefix else ''
+ g = [row_prefix + i for i in same_slicer(after, blo, bhi)]
yield g
# only_differences - only return info about the differences, no context
# line_feed_sep could be " " or "
" or "\n" etc
-def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"):
+def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, include_replaced=True, line_feed_sep="\n", include_change_type_prefix=True):
newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()]
@@ -40,9 +44,13 @@ def render_diff(previous_version_file_contents, newest_version_file_contents, in
else:
previous_version_file_contents = ""
- rendered_diff = customSequenceMatcher(previous_version_file_contents,
- newest_version_file_contents,
- include_equal, include_removed, include_added)
+ rendered_diff = customSequenceMatcher(before=previous_version_file_contents,
+ after=newest_version_file_contents,
+ include_equal=include_equal,
+ include_removed=include_removed,
+ include_added=include_added,
+ include_replaced=include_replaced,
+ include_change_type_prefix=include_change_type_prefix)
# Recursively join lists
f = lambda L: line_feed_sep.join([f(x) if type(x) is list else x for x in L])
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 9c086e37..55566a01 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -399,13 +399,19 @@ class watchForm(commonSettingsForm):
body = TextAreaField('Request body', [validators.Optional()])
method = SelectField('Request method', choices=valid_method, default=default_method)
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
- check_unique_lines = BooleanField('Only trigger when new lines appear', default=False)
+ check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
+
+ filter_text_added = BooleanField('Added lines', default=True)
+ filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
+ filter_text_removed = BooleanField('Removed lines', default=True)
+
+ # @todo this class could be moved to its own text_json_diff_watchForm and this goes to restock_diff_Watchform perhaps
in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True)
trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
- text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
+ text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index 9af922ae..c2b48c2e 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -26,6 +26,9 @@ base_config = {
'fetch_backend': 'system', # plaintext, playwright etc
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
+ 'filter_text_added': True,
+ 'filter_text_replaced': True,
+ 'filter_text_removed': True,
'has_ldjson_price_data': None,
'track_ldjson_price_data': None,
'headers': {}, # Extra headers to send
@@ -326,7 +329,8 @@ class model(dict):
# Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({})
for k, v in self.history.items():
- alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')])
+ content = self.get_history_snapshot(k)
+ alist = set([line.strip().lower() for line in content.splitlines()])
existing_history = existing_history.union(alist)
# Check that everything in local_lines(new stuff) already exists in existing_history - it should
@@ -454,3 +458,38 @@ class model(dict):
# Return list of tags, stripped and lowercase, used for searching
def all_tags(self):
return [s.strip().lower() for s in self.get('tag','').split(',')]
+
+ def has_special_diff_filter_options_set(self):
+ """Return True only when some, but not all, of filter_text_added / filter_text_replaced / filter_text_removed are switched off."""
+ # All False - nothing would be done, so act like it's not processable
+ if not self.get('filter_text_added', True) and not self.get('filter_text_replaced', True) and not self.get('filter_text_removed', True):
+ return False
+
+ # At least one (but not all three) is switched off, so a special filter is in effect
+ if not self.get('filter_text_added', True) or not self.get('filter_text_replaced', True) or not self.get('filter_text_removed', True):
+ return True
+
+ # All three are on (a missing key counts as on), so nothing special is set
+ return False
+
+
+ def get_last_fetched_before_filters(self):
+ import brotli
+ filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
+ # Purpose: return, as text, the content stored in 'last-fetched.br'; without that file fall back to the history, or '' when there is none
+ if not os.path.isfile(filepath):
+ # If a previous attempt doesn't exist yet, use the snapshot of the last key in self.history (presumably the newest - TODO confirm ordering)
+ dates = list(self.history.keys())
+ if len(dates):
+ return self.get_history_snapshot(dates[-1])
+ else:
+ return ''
+ # The file exists: decompress it with brotli and decode the bytes as UTF-8
+ with open(filepath, 'rb') as f:
+ return(brotli.decompress(f.read()).decode('utf-8'))
+
+ def save_last_fetched_before_filters(self, contents):  # compress `contents` with brotli and write it to 'last-fetched.br' in self.watch_data_dir
+ import brotli
+ filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
+ with open(filepath, 'wb') as f:  # opened in 'wb' mode, so any existing file is replaced
+ f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))  # NOTE(review): the visible caller passes UTF-8 encoded bytes - confirm `contents` is always bytes
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 14ce14f3..cf85522a 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -279,6 +279,34 @@ class perform_site_check(difference_detection_processor):
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
+
+ # @todo whitespace coming from missing rtrim()?
+ # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
+ # Rewrites the processed text so that it contains only the diff results they want to see
+ if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
+ # Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences
+ from .. import diff
+ # needs to not include (added) etc or it may get used twice
+ # Replace the processed text with the preferred result
+ rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
+ newest_version_file_contents=stripped_text_from_html,
+ include_equal=False, # not the same lines
+ include_added=watch.get('filter_text_added', True),
+ include_removed=watch.get('filter_text_removed', True),
+ include_replaced=watch.get('filter_text_replaced', True),
+ line_feed_sep="\n",
+ include_change_type_prefix=False)
+
+ watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
+
+ if not rendered_diff and stripped_text_from_html:
+ # We had some content, but no differences were found
+ # Store the MD5 of the new content as previous_md5 so that a later change can still trigger
+ c = hashlib.md5(text_content_before_ignored_filter.translate(None, b'\r\n\t ')).hexdigest()
+ return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
+ else:
+ stripped_text_from_html = rendered_diff
+
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
@@ -337,6 +365,7 @@ class perform_site_check(difference_detection_processor):
blocked = True
# Filter and trigger works the same, so reuse it
# It should return the line numbers that match
+ # Unblock the flow if the trigger was found (some text remained after stripping what didn't match)
result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
wordlist=trigger_text,
mode="line numbers")
diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh
index 70184051..d9fa9ff0 100755
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -35,3 +35,4 @@ pytest tests/test_access_control.py
pytest tests/test_notification.py
pytest tests/test_backend.py
pytest tests/test_rss.py
+pytest tests/test_unique_lines.py
\ No newline at end of file
diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss
index 4ed3412f..4ec31ed3 100644
--- a/changedetectionio/static/styles/scss/styles.scss
+++ b/changedetectionio/static/styles/scss/styles.scss
@@ -893,6 +893,21 @@ body.full-width {
font-size: .875em;
}
}
+ .text-filtering {
+ h3 {
+ margin-top: 0;
+ }
+ border: 1px solid #ccc;
+ padding: 1rem;
+ border-radius: 5px;
+ margin-bottom: 1rem;
+ fieldset:last-of-type {
+ padding-bottom: 0;
+ .pure-control-group {
+ padding-bottom: 0;
+ }
+ }
+ }
}
ul {
diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css
index c8497847..1e0559f4 100644
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -872,6 +872,17 @@ body.full-width .edit-form {
color: var(--color-text-input-description); }
.edit-form .pure-form-message-inline code {
font-size: .875em; }
+ .edit-form .text-filtering {
+ border: 1px solid #ccc;
+ padding: 1rem;
+ border-radius: 5px;
+ margin-bottom: 1rem; }
+ .edit-form .text-filtering h3 {
+ margin-top: 0; }
+ .edit-form .text-filtering fieldset:last-of-type {
+ padding-bottom: 0; }
+ .edit-form .text-filtering fieldset:last-of-type .pure-control-group {
+ padding-bottom: 0; }
ul {
padding-left: 1em;
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 298b8c90..b8f0a747 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -244,12 +244,6 @@ User-Agent: wonderbra 1.0") }}
-
{% set field = render_field(form.include_filters,
rows=5,
@@ -287,37 +281,39 @@ xpath://body/div/span[contains(@class, 'example-class')]",
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help.
A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"
+
Something irrelevant
+
+
+ """
+
+ if excluding:
+ output = ""
+ for i in test_return_data.splitlines():
+ if not excluding in i:
+ output += f"{i}\n"
+
+ test_return_data = output
+
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(test_return_data)
+
+
+def test_check_removed_line_contains_trigger(client, live_server):
+ live_server_setup(live_server)
+ # Flow: import a watch, set a trigger text with only filter_text_removed submitted, then check which content changes mark the watch as unviewed
+ sleep_time_for_fetch_thread = 3
+
+ # Give the endpoint time to spin up
+ time.sleep(1)
+ set_original()
+ # Add our URL to the import page
+ test_url = url_for('test_endpoint', _external=True)
+ res = client.post(
+ url_for("import_page"),
+ data={"urls": test_url},
+ follow_redirects=True
+ )
+ assert b"1 Imported" in res.data
+
+ # Give the thread time to pick it up
+ time.sleep(sleep_time_for_fetch_thread)
+
+ # Go to the edit page and set the trigger text
+ # Only filter_text_removed is submitted (filter_text_added/filter_text_replaced are left out of the form data)
+ res = client.post(
+ url_for("edit_page", uuid="first"),
+ data={"trigger_text": 'The golden line',
+ "url": test_url,
+ 'fetch_backend': "html_requests",
+ 'filter_text_removed': 'y'},
+ follow_redirects=True
+ )
+ assert b"Updated watch." in res.data
+ time.sleep(sleep_time_for_fetch_thread)
+ set_original(excluding='Something irrelevant')
+
+ # Removing a line that's not the trigger should not trigger anything
+ res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
+ assert b'1 watches queued for rechecking.' in res.data
+ time.sleep(sleep_time_for_fetch_thread)
+ res = client.get(url_for("index"))
+ assert b'unviewed' not in res.data
+
+ # The trigger line is REMOVED, this should trigger
+ set_original(excluding='The golden line')
+ client.get(url_for("form_watch_checknow"), follow_redirects=True)
+ time.sleep(sleep_time_for_fetch_thread)
+ res = client.get(url_for("index"))
+ assert b'unviewed' in res.data
+
+
+ # Now add it back, and we should not get a trigger
+ client.get(url_for("mark_all_viewed"), follow_redirects=True)
+ set_original(excluding=None)
+ client.get(url_for("form_watch_checknow"), follow_redirects=True)
+ time.sleep(sleep_time_for_fetch_thread)
+ res = client.get(url_for("index"))
+ assert b'unviewed' not in res.data
+
+ # Remove it again, and we should get a trigger
+ set_original(excluding='The golden line')
+ client.get(url_for("form_watch_checknow"), follow_redirects=True)
+ time.sleep(sleep_time_for_fetch_thread)
+ res = client.get(url_for("index"))
+ assert b'unviewed' in res.data
+ # Clean up: delete all watches
+ res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+ assert b'Deleted' in res.data
diff --git a/changedetectionio/tests/test_block_while_text_present.py b/changedetectionio/tests/test_block_while_text_present.py
index 0f38bf36..0538060b 100644
--- a/changedetectionio/tests/test_block_while_text_present.py
+++ b/changedetectionio/tests/test_block_while_text_present.py
@@ -87,7 +87,10 @@ def test_check_block_changedetection_text_NOT_present(client, live_server):
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
- data={"text_should_not_be_present": ignore_text, "url": test_url, 'fetch_backend': "html_requests"},
+ data={"text_should_not_be_present": ignore_text,
+ "url": test_url,
+ 'fetch_backend': "html_requests"
+ },
follow_redirects=True
)
assert b"Updated watch." in res.data
@@ -129,7 +132,6 @@ def test_check_block_changedetection_text_NOT_present(client, live_server):
set_modified_response_minus_block_text()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread)
-
res = client.get(url_for("index"))
assert b'unviewed' in res.data
diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py
index 6fb2e420..e7132ca5 100644
--- a/changedetectionio/tests/test_unique_lines.py
+++ b/changedetectionio/tests/test_unique_lines.py
@@ -94,7 +94,6 @@ def test_unique_lines_functionality(client, live_server):
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
-
# Now set the content which contains the new text and re-ordered existing text
set_modified_with_trigger_text_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
diff --git a/changedetectionio/tests/unit/test_notification_diff.py b/changedetectionio/tests/unit/test_notification_diff.py
index a6b7067f..6f323146 100755
--- a/changedetectionio/tests/unit/test_notification_diff.py
+++ b/changedetectionio/tests/unit/test_notification_diff.py
@@ -19,8 +19,12 @@ class TestDiffBuilder(unittest.TestCase):
with open(base_dir + "/test-content/after.txt", 'r') as f:
newest_version_file_contents = f.read()
- output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
+ output = diff.render_diff(previous_version_file_contents=previous_version_file_contents,
+ newest_version_file_contents=newest_version_file_contents)
+
output = output.split("\n")
+
+
self.assertIn('(changed) ok', output)
self.assertIn('(into) xok', output)
self.assertIn('(into) next-x-ok', output)
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index da570b58..85bb0ff6 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -322,6 +322,7 @@ class update_worker(threading.Thread):
self.cleanup_error_artifacts(uuid)
+ #
# Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
if process_changedetection_results:
try: