From 2b948c15c10a089b8a843f345c76c1ee25df7de9 Mon Sep 17 00:00:00 2001
From: dgtlmoon
"
+PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@@ -17,7 +18,23 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
class JSONNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
-
+
+
+# Python's `re` module does not understand Perl-style /pattern/flags delimiters,
+# so convert them to an inline-flag prefix, e.g. "/foobar/i" becomes "(?i)foobar"
+def perl_style_slash_enclosed_regex_to_options(regex):
+
+ res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
+
+ if res:
+ flags = res.group(2) if res.group(2) else 'i'
+ regex = f"(?{flags}){res.group(1)}"
+ else:
+ # Fall back to just ignorecase as an option
+ regex = f"(?i){regex}"
+
+ return regex
+
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
soup = BeautifulSoup(html_content, "html.parser")
@@ -195,23 +212,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
output = []
ignore_text = []
ignore_regex = []
-
ignored_line_numbers = []
for k in wordlist:
# Is it a regex?
- x = re.search('^\/(.*)\/(.*)', k.strip())
- if x:
- # Starts with / but doesn't look like a regex
- p = x.group(1)
- try:
- # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
- ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
- except Exception as e:
- # Badly formed regex, treat as text
- ignore_text.append(k.strip())
+ res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
+ if res:
+ ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
else:
- # Had a / but doesn't work as regex
ignore_text.append(k.strip())
for line in content.splitlines():
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 5e69a591..19ef78da 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy
from . import difference_detection_processor
+from ..html_tools import PERL_STYLE_REGEX
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-name = 'Webpage Text/HTML, JSON and PDF changes'
+name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
+
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
+
class PDFToHTMLToolNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
@@ -37,19 +39,6 @@ class perform_site_check(difference_detection_processor):
super().__init__(*args, **kwargs)
self.datastore = datastore
- # Doesn't look like python supports forward slash auto enclosure in re.findall
- # So convert it to inline flag "foobar(?i)" type configuration
- def forward_slash_enclosed_regex_to_options(self, regex):
- res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
-
- if res:
- regex = res.group(1)
- regex += '(?{})'.format(res.group(2))
- else:
- regex += '(?{})'.format('i')
-
- return regex
-
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
changed_detected = False
screenshot = False # as bytes
@@ -135,7 +124,8 @@ class perform_site_check(difference_detection_processor):
# requests for PDF's, images etc should be passwd the is_binary flag
is_binary = watch.is_pdf
- fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
+ fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
+ is_binary=is_binary)
fetcher.quit()
self.screenshot = fetcher.screenshot
@@ -151,7 +141,6 @@ class perform_site_check(difference_detection_processor):
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
-
# Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base?
@@ -231,8 +220,6 @@ class perform_site_check(difference_detection_processor):
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
is_html = False
-
-
if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -283,7 +270,6 @@ class perform_site_check(difference_detection_processor):
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
-
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
# Rewrite's the processing text based on only what diff result they want to see
@@ -293,13 +279,13 @@ class perform_site_check(difference_detection_processor):
# needs to not include (added) etc or it may get used twice
# Replace the processed text with the preferred result
rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
- newest_version_file_contents=stripped_text_from_html,
- include_equal=False, # not the same lines
- include_added=watch.get('filter_text_added', True),
- include_removed=watch.get('filter_text_removed', True),
- include_replaced=watch.get('filter_text_replaced', True),
- line_feed_sep="\n",
- include_change_type_prefix=False)
+ newest_version_file_contents=stripped_text_from_html,
+ include_equal=False, # not the same lines
+ include_added=watch.get('filter_text_added', True),
+ include_removed=watch.get('filter_text_removed', True),
+ include_replaced=watch.get('filter_text_replaced', True),
+ line_feed_sep="\n",
+ include_change_type_prefix=False)
watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
@@ -340,16 +326,25 @@ class perform_site_check(difference_detection_processor):
regex_matched_output = []
for s_re in extract_text:
# incase they specified something in '/.../x'
- regex = self.forward_slash_enclosed_regex_to_options(s_re)
- result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
-
- for l in result:
- if type(l) is tuple:
- # @todo - some formatter option default (between groups)
- regex_matched_output += list(l) + [b'\n']
- else:
- # @todo - some formatter option default (between each ungrouped result)
- regex_matched_output += [l] + [b'\n']
+ if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
+ regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
+ result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+ for l in result:
+ if type(l) is tuple:
+ # @todo - some formatter option default (between groups)
+ regex_matched_output += list(l) + [b'\n']
+ else:
+ # @todo - some formatter option default (between each ungrouped result)
+ regex_matched_output += [l] + [b'\n']
+ else:
+ # Doesn't look like a regex, so search for it as plain text and return whatever matches
+ # `stripped_text_from_html` is bytes, so s_re must also be encoded to bytes
+ r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
+ res = r.findall(stripped_text_from_html)
+ if res:
+ for match in res:
+ regex_matched_output += [match] + [b'\n']
# Now we will only show what the regex matched
stripped_text_from_html = b''
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index a6302b30..73d5cac8 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -378,15 +378,16 @@ Unavailable") }}
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py
index fec939f1..a96b443f 100644
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -2,7 +2,7 @@
import time
from flask import url_for
-from .util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks
from ..html_tools import *
@@ -55,6 +55,8 @@ def set_multiline_response():