regex-cleanup-311
dgtlmoon 1 year ago
parent 8c8f378395
commit fc38e30989

@ -10,6 +10,7 @@ import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]?)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here # 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites # all of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@ -18,6 +19,20 @@ class JSONNotFound(ValueError):
def __init__(self, msg): def __init__(self, msg):
ValueError.__init__(self, msg) ValueError.__init__(self, msg)
# Python's re functions do not understand Perl-style "/pattern/flag" enclosure,
# so rewrite it as an inline-flag pattern of the form "(?i)foobar".
def perl_style_slash_enclosed_regex_to_options(regex):
    """Convert a Perl-style slash-enclosed regex into an inline-flag pattern.

    Args:
        regex: Either a ``/pattern/flag`` string (as recognised by
            ``PERL_STYLE_REGEX``) or a bare pattern.

    Returns:
        The pattern prefixed with an inline flag group. Slash-enclosed input
        becomes ``(?<flag>)pattern``, with ``i`` used when no flag was given.
        Any other input is returned unchanged apart from a ``(?i)`` prefix.
    """
    res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
    if not res:
        # Not slash-enclosed: use the text as-is, case-insensitively.
        return f"(?i){regex}"

    flags = res.group(2) or 'i'
    # The match above is case-insensitive, so "/foo/I" gets here with an
    # uppercase flag. Python's inline flags are lowercase, except for 'L',
    # so normalise everything else to avoid an "unknown flag" re.error.
    if flags != 'L':
        flags = flags.lower()
    return f"(?{flags}){res.group(1)}"
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False): def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
soup = BeautifulSoup(html_content, "html.parser") soup = BeautifulSoup(html_content, "html.parser")
@ -195,23 +210,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
output = [] output = []
ignore_text = [] ignore_text = []
ignore_regex = [] ignore_regex = []
ignored_line_numbers = [] ignored_line_numbers = []
for k in wordlist: for k in wordlist:
# Is it a regex? # Is it a regex?
x = re.search('^\/(.*)\/(.*)', k.strip()) res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
if x: if res:
# Starts with / but doesn't look like a regex ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
p = x.group(1)
try:
# @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
except Exception as e:
# Badly formed regex, treat as text
ignore_text.append(k.strip())
else: else:
# Had a / but doesn't work as regex
ignore_text.append(k.strip()) ignore_text.append(k.strip())
for line in content.splitlines(): for line in content.splitlines():

@ -37,19 +37,6 @@ class perform_site_check(difference_detection_processor):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.datastore = datastore self.datastore = datastore
# Python's re functions do not understand "/pattern/flags" enclosure,
# so convert it to an inline-flag pattern of the form "(?i)foobar".
def forward_slash_enclosed_regex_to_options(self, regex):
    """Convert a ``/pattern/flags`` string into an inline-flag regex.

    Args:
        regex: Either a slash-enclosed pattern with optional trailing
            flags (``/foo/``, ``/foo/i``, ``/foo/ms``) or a bare pattern.

    Returns:
        ``(?<flags>)pattern`` for slash-enclosed input, defaulting to the
        ``i`` flag when none were given. A bare pattern is returned with a
        ``(?i)`` prefix.
    """
    # The flags group is optional (\w*), so "/foo/" is still recognised as
    # slash-enclosed rather than being treated as a pattern with literal slashes.
    res = re.search(r'^/(.*?)/(\w*)$', regex)
    if res:
        flags = res.group(2) or 'i'
        return f"(?{flags}){res.group(1)}"
    return f"(?i){regex}"
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
changed_detected = False changed_detected = False
screenshot = False # as bytes screenshot = False # as bytes
@ -340,7 +327,7 @@ class perform_site_check(difference_detection_processor):
regex_matched_output = [] regex_matched_output = []
for s_re in extract_text: for s_re in extract_text:
# in case they specified something in '/.../x' # in case they specified something in '/.../x'
regex = self.forward_slash_enclosed_regex_to_options(s_re) regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html) result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
for l in result: for l in result:

@ -2,7 +2,7 @@
import time import time
from flask import url_for from flask import url_for
from . util import live_server_setup from .util import live_server_setup, wait_for_all_checks
def set_original_ignore_response(): def set_original_ignore_response():
@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server):
live_server_setup(live_server) live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
set_original_ignore_response() set_original_ignore_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page # Add our URL to the import page
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
res = client.post( res = client.post(
@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server):
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread) wait_for_all_checks(client)
# It should report nothing found (just a new one shouldn't have anything) # It should report nothing found (just a new one shouldn't have anything)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server):
"fetch_backend": "html_requests"}, "fetch_backend": "html_requests"},
follow_redirects=True follow_redirects=True
) )
time.sleep(sleep_time_for_fetch_thread) wait_for_all_checks(client)
# so that we set the state to 'unviewed' after all the edits # so that we set the state to 'unviewed' after all the edits
client.get(url_for("diff_history_page", uuid="first")) client.get(url_for("diff_history_page", uuid="first"))
@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("some new noise") f.write("some new noise")
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread) wait_for_all_checks(client)
# It should report nothing found (nothing should match the regex) # It should report nothing found (nothing should match the regex)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("regex test123<br>\nsomething 123") f.write("regex test123<br>\nsomething 123")
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread) wait_for_all_checks(client)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data

Loading…
Cancel
Save