|
|
@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools
|
|
|
|
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
|
|
|
|
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
|
|
|
|
from copy import deepcopy
|
|
|
|
from copy import deepcopy
|
|
|
|
from . import difference_detection_processor
|
|
|
|
from . import difference_detection_processor
|
|
|
|
|
|
|
|
from ..html_tools import PERL_STYLE_REGEX
|
|
|
|
|
|
|
|
|
|
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name = 'Webpage Text/HTML, JSON and PDF changes'
|
|
|
|
name = 'Webpage Text/HTML, JSON and PDF changes'
|
|
|
|
description = 'Detects all text changes where possible'
|
|
|
|
description = 'Detects all text changes where possible'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FilterNotFoundInResponse(ValueError):
|
|
|
|
class FilterNotFoundInResponse(ValueError):
|
|
|
|
def __init__(self, msg):
|
|
|
|
def __init__(self, msg):
|
|
|
|
ValueError.__init__(self, msg)
|
|
|
|
ValueError.__init__(self, msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFToHTMLToolNotFound(ValueError):
|
|
|
|
class PDFToHTMLToolNotFound(ValueError):
|
|
|
|
def __init__(self, msg):
|
|
|
|
def __init__(self, msg):
|
|
|
|
ValueError.__init__(self, msg)
|
|
|
|
ValueError.__init__(self, msg)
|
|
|
@ -37,19 +39,6 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
super().__init__(*args, **kwargs)
|
|
|
|
super().__init__(*args, **kwargs)
|
|
|
|
self.datastore = datastore
|
|
|
|
self.datastore = datastore
|
|
|
|
|
|
|
|
|
|
|
|
# Doesn't look like python supports forward slash auto enclosure in re.findall
|
|
|
|
|
|
|
|
# So convert it to inline flag "foobar(?i)" type configuration
|
|
|
|
|
|
|
|
def forward_slash_enclosed_regex_to_options(self, regex):
|
|
|
|
|
|
|
|
res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if res:
|
|
|
|
|
|
|
|
regex = res.group(1)
|
|
|
|
|
|
|
|
regex += '(?{})'.format(res.group(2))
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
regex += '(?{})'.format('i')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return regex
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
|
|
|
|
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
|
|
|
|
changed_detected = False
|
|
|
|
changed_detected = False
|
|
|
|
screenshot = False # as bytes
|
|
|
|
screenshot = False # as bytes
|
|
|
@ -135,7 +124,8 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
# requests for PDF's, images etc should be passwd the is_binary flag
|
|
|
|
# requests for PDF's, images etc should be passwd the is_binary flag
|
|
|
|
is_binary = watch.is_pdf
|
|
|
|
is_binary = watch.is_pdf
|
|
|
|
|
|
|
|
|
|
|
|
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
|
|
|
|
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
|
|
|
|
|
|
|
|
is_binary=is_binary)
|
|
|
|
fetcher.quit()
|
|
|
|
fetcher.quit()
|
|
|
|
|
|
|
|
|
|
|
|
self.screenshot = fetcher.screenshot
|
|
|
|
self.screenshot = fetcher.screenshot
|
|
|
@ -151,7 +141,6 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
|
|
|
|
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
|
|
|
|
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
|
|
|
|
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetching complete, now filters
|
|
|
|
# Fetching complete, now filters
|
|
|
|
# @todo move to class / maybe inside of fetcher abstract base?
|
|
|
|
# @todo move to class / maybe inside of fetcher abstract base?
|
|
|
|
|
|
|
|
|
|
|
@ -231,8 +220,6 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
|
|
|
|
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
|
|
|
|
is_html = False
|
|
|
|
is_html = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if is_html or is_source:
|
|
|
|
if is_html or is_source:
|
|
|
|
|
|
|
|
|
|
|
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
|
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
|
@ -283,7 +270,6 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
# Re #340 - return the content before the 'ignore text' was applied
|
|
|
|
# Re #340 - return the content before the 'ignore text' was applied
|
|
|
|
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
|
|
|
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# @todo whitespace coming from missing rtrim()?
|
|
|
|
# @todo whitespace coming from missing rtrim()?
|
|
|
|
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
|
|
|
|
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
|
|
|
|
# Rewrite's the processing text based on only what diff result they want to see
|
|
|
|
# Rewrite's the processing text based on only what diff result they want to see
|
|
|
@ -340,7 +326,8 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
regex_matched_output = []
|
|
|
|
regex_matched_output = []
|
|
|
|
for s_re in extract_text:
|
|
|
|
for s_re in extract_text:
|
|
|
|
# incase they specified something in '/.../x'
|
|
|
|
# incase they specified something in '/.../x'
|
|
|
|
regex = self.forward_slash_enclosed_regex_to_options(s_re)
|
|
|
|
if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
|
|
|
|
|
|
|
|
regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
|
|
|
|
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
|
|
|
|
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
|
|
|
|
|
|
|
|
|
|
|
|
for l in result:
|
|
|
|
for l in result:
|
|
|
@ -350,6 +337,14 @@ class perform_site_check(difference_detection_processor):
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
# @todo - some formatter option default (between each ungrouped result)
|
|
|
|
# @todo - some formatter option default (between each ungrouped result)
|
|
|
|
regex_matched_output += [l] + [b'\n']
|
|
|
|
regex_matched_output += [l] + [b'\n']
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
# Doesnt look like regex, just hunt for plaintext and return that which matches
|
|
|
|
|
|
|
|
# `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
|
|
|
|
|
|
|
|
r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
|
|
|
|
|
|
|
|
res = r.findall(stripped_text_from_html)
|
|
|
|
|
|
|
|
if res:
|
|
|
|
|
|
|
|
for match in res:
|
|
|
|
|
|
|
|
regex_matched_output += [match] + [b'\n']
|
|
|
|
|
|
|
|
|
|
|
|
# Now we will only show what the regex matched
|
|
|
|
# Now we will only show what the regex matched
|
|
|
|
stripped_text_from_html = b''
|
|
|
|
stripped_text_from_html = b''
|
|
|
|