diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 671c96c6..8df14f32 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -7,13 +7,14 @@ from typing import List
import json
import re
+
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "
"
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here
-# all of those may or may not appear on different websites
-LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
+# All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here
+LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
class JSONNotFound(ValueError):
def __init__(self, msg):
@@ -161,7 +162,6 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# Foreach blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole
- s = []
soup = BeautifulSoup(content, 'html.parser')
if ensure_is_ldjson_info_type:
@@ -187,13 +187,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
for json_data in bs_jsons:
stripped_text_from_html = _parse_json(json_data, json_filter)
+
if ensure_is_ldjson_info_type:
# Could sometimes be list, string or something else random
if isinstance(json_data, dict):
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
- if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
- break
+ # @type may be a single string or a list of strings (e.g. ["Product", "SubType"])
+ # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
+ # 1833 - any other kind of @type value (dict, number, ...) is ignored
+ ld_type = json_data.get('@type')
+ if ld_type and stripped_text_from_html and isinstance(ld_type, (str, list)):
+     types = [ld_type] if isinstance(ld_type, str) else ld_type
+     try:
+         if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]:
+             break
+     except AttributeError:
+         continue
+
elif stripped_text_from_html:
break
@@ -283,9 +294,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
+ pricing_data = ''
+
try:
- pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product")
- except JSONNotFound as e:
+        # Cheap pre-check: without any LD+JSON marker in the content there is nothing to extract
+        if 'application/ld+json' not in content:
+            return False
+        # Try every known spelling of the offers key; whatever each one finds is concatenated
+        for selector in LD_JSON_PRODUCT_OFFER_SELECTORS:
+            pricing_data += extract_json_as_string(content=content,
+                                                   json_filter=selector,
+                                                   ensure_is_ldjson_info_type="product")
+    except Exception:
# Totally fine
return False
x=bool(pricing_data)
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 19ef78da..bada0a1d 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
-
+json_filter_prefixes = ['json:', 'jq:']
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
@@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor):
# Inject a virtual LD+JSON price tracker rule
if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
- include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
+ include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
@@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor):
pass
if has_filter_rule:
- json_filter_prefixes = ['json:', 'jq:']
for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
diff --git a/changedetectionio/tests/test_automatic_follow_ldjson_price.py b/changedetectionio/tests/test_automatic_follow_ldjson_price.py
index c95e8fcf..ff1e6330 100644
--- a/changedetectionio/tests/test_automatic_follow_ldjson_price.py
+++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py
@@ -2,7 +2,8 @@
import time
from flask import url_for
-from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
+from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI, wait_for_all_checks
+
def set_response_with_ldjson():
test_return_data = """
@@ -27,7 +28,7 @@ def set_response_with_ldjson():
"description":"You dont need it",
"mpn":"111111",
"sku":"22222",
- "offers":{
+ "Offers":{
"@type":"AggregateOffer",
"lowPrice":8097000,
"highPrice":8099900,
@@ -75,12 +76,11 @@ def set_response_without_ldjson():
f.write(test_return_data)
return None
-# actually only really used by the distll.io importer, but could be handy too
-def test_check_ldjson_price_autodetect(client, live_server):
+def test_setup(client, live_server):
live_server_setup(live_server)
- # Give the endpoint time to spin up
- time.sleep(1)
+# actually only really used by the distill.io importer, but could be handy too
+def test_check_ldjson_price_autodetect(client, live_server):
set_response_with_ldjson()
@@ -92,7 +92,7 @@ def test_check_ldjson_price_autodetect(client, live_server):
follow_redirects=True
)
assert b"1 Imported" in res.data
- time.sleep(3)
+ wait_for_all_checks(client)
# Should get a notice that it's available
res = client.get(url_for("index"))
@@ -102,11 +102,11 @@ def test_check_ldjson_price_autodetect(client, live_server):
uuid = extract_UUID_from_client(client)
client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True))
- time.sleep(2)
+ wait_for_all_checks(client)
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
- time.sleep(2)
+ wait_for_all_checks(client)
# Offer should be gone
res = client.get(url_for("index"))
assert b'Embedded price data' not in res.data
@@ -138,9 +138,97 @@ def test_check_ldjson_price_autodetect(client, live_server):
follow_redirects=True
)
assert b"1 Imported" in res.data
- time.sleep(3)
+ wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'ldjson-price-track-offer' not in res.data
##########################################################################################
client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+
+
+def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data):
+    """Import the test endpoint, then assert each watch has no error and the expected LD+JSON price flag."""
+    endpoint_url = url_for('test_endpoint', _external=True)
+    import_response = client.post(
+        url_for("import_page"),
+        data={"urls": endpoint_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in import_response.data
+    wait_for_all_checks(client)
+
+    watches = client.application.config.get('DATASTORE').data['watching']
+    for watch in watches.values():
+        assert watch.get('last_error') == False
+        assert watch.get('has_ldjson_price_data') == has_ldjson_price_data
+
+    ##########################################################################################
+    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+
+
+def test_bad_ldjson_is_correctly_ignored(client, live_server):
+ # live_server_setup(live_server) is not repeated here; test_setup() earlier in this module already calls it
+ test_return_data = """
+
+
+
+
+
+ Some extra stuff
+
+ """
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(test_return_data)
+
+ _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=True)
+ test_return_data = """
+
+
+
+
+
+ Some extra stuff
+
+ """
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(test_return_data)
+
+ _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False)
+