diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 671c96c6..8df14f32 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -7,13 +7,14 @@ from typing import List import json import re + # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' # 'price' , 'lowPrice', 'highPrice' are usually under here -# all of those may or may not appear on different websites -LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" +# All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here +LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] class JSONNotFound(ValueError): def __init__(self, msg): @@ -161,7 +162,6 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # Foreach blob.. just return the first that matches json_filter # As a last resort, try to parse the whole - s = [] soup = BeautifulSoup(content, 'html.parser') if ensure_is_ldjson_info_type: @@ -187,13 +187,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None for json_data in bs_jsons: stripped_text_from_html = _parse_json(json_data, json_filter) + if ensure_is_ldjson_info_type: # Could sometimes be list, string or something else random if isinstance(json_data, dict): # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) - if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: - break + # @type could also be a list (Product, SubType) + # LD_JSON auto-extract also requires some content PLUS the ldjson to be present + # 1833 - could be either str or list, should not be anything else + if json_data.get('@type') and stripped_text_from_html: + try: + if isinstance(json_data.get('@type'), (str, list)): + types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type') + if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]: + break + except (AttributeError, TypeError): + continue + elif 
stripped_text_from_html: break @@ -283,9 +294,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: # Does LD+JSON exist with a @type=='product' and a .price set anywhere? def has_ldjson_product_info(content): + pricing_data = '' + try: - pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product") - except JSONNotFound as e: + if 'application/ld+json' not in content: + return False + + for json_selector in LD_JSON_PRODUCT_OFFER_SELECTORS: + pricing_data += extract_json_as_string(content=content, + json_filter=json_selector, + ensure_is_ldjson_info_type="product") + + except Exception: # Totally fine return False x=bool(pricing_data) diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 19ef78da..bada0a1d 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' - +json_filter_prefixes = ['json:', 'jq:'] class FilterNotFoundInResponse(ValueError): def __init__(self, msg): @@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor): # Inject a virtual LD+JSON price tracker rule if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT: - include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR) + include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip()) has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip()) @@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor): pass if has_filter_rule: - json_filter_prefixes = ['json:', 'jq:'] for filter in include_filters_rule: if 
any(prefix in filter for prefix in json_filter_prefixes): stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) diff --git a/changedetectionio/tests/test_automatic_follow_ldjson_price.py b/changedetectionio/tests/test_automatic_follow_ldjson_price.py index c95e8fcf..ff1e6330 100644 --- a/changedetectionio/tests/test_automatic_follow_ldjson_price.py +++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py @@ -2,7 +2,8 @@ import time from flask import url_for -from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI +from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI, wait_for_all_checks + def set_response_with_ldjson(): test_return_data = """ @@ -27,7 +28,7 @@ def set_response_with_ldjson(): "description":"You dont need it", "mpn":"111111", "sku":"22222", - "offers":{ + "Offers":{ "@type":"AggregateOffer", "lowPrice":8097000, "highPrice":8099900, @@ -75,12 +76,11 @@ def set_response_without_ldjson(): f.write(test_return_data) return None -# actually only really used by the distll.io importer, but could be handy too -def test_check_ldjson_price_autodetect(client, live_server): +def test_setup(client, live_server): live_server_setup(live_server) - # Give the endpoint time to spin up - time.sleep(1) +# actually only really used by the distll.io importer, but could be handy too +def test_check_ldjson_price_autodetect(client, live_server): set_response_with_ldjson() @@ -92,7 +92,7 @@ def test_check_ldjson_price_autodetect(client, live_server): follow_redirects=True ) assert b"1 Imported" in res.data - time.sleep(3) + wait_for_all_checks(client) # Should get a notice that it's available res = client.get(url_for("index")) @@ -102,11 +102,11 @@ def test_check_ldjson_price_autodetect(client, live_server): uuid = extract_UUID_from_client(client) client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True)) - time.sleep(2) + 
wait_for_all_checks(client) # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(2) + wait_for_all_checks(client) # Offer should be gone res = client.get(url_for("index")) assert b'Embedded price data' not in res.data @@ -138,9 +138,97 @@ def test_check_ldjson_price_autodetect(client, live_server): follow_redirects=True ) assert b"1 Imported" in res.data - time.sleep(3) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'ldjson-price-track-offer' not in res.data ########################################################################################## client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + + +def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data): + + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + for k,v in client.application.config.get('DATASTORE').data['watching'].items(): + assert v.get('last_error') == False + assert v.get('has_ldjson_price_data') == has_ldjson_price_data + + + ########################################################################################## + client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + + +def test_bad_ldjson_is_correctly_ignored(client, live_server): + #live_server_setup(live_server) + test_return_data = """ + + + + + +
Some extra stuff
+ + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=True) + test_return_data = """ + + + + + +
Some extra stuff
+ + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False) +