LD JSON Price followers - Handle incorrectly created LD-JSON price structures better (#1837)

1800-selenium-socks5-auth
dgtlmoon 1 year ago committed by GitHub
parent a7132b1cfc
commit ceac8c21e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -7,13 +7,14 @@ from typing import List
import json import json
import re import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here # 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
class JSONNotFound(ValueError): class JSONNotFound(ValueError):
def __init__(self, msg): def __init__(self, msg):
@ -161,7 +162,6 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# Foreach <script json></script> blob.. just return the first that matches json_filter # Foreach <script json></script> blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole <body> # As a last resort, try to parse the whole <body>
s = []
soup = BeautifulSoup(content, 'html.parser') soup = BeautifulSoup(content, 'html.parser')
if ensure_is_ldjson_info_type: if ensure_is_ldjson_info_type:
@ -187,13 +187,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
for json_data in bs_jsons: for json_data in bs_jsons:
stripped_text_from_html = _parse_json(json_data, json_filter) stripped_text_from_html = _parse_json(json_data, json_filter)
if ensure_is_ldjson_info_type: if ensure_is_ldjson_info_type:
# Could sometimes be list, string or something else random # Could sometimes be list, string or something else random
if isinstance(json_data, dict): if isinstance(json_data, dict):
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: # @type could also be a list (Product, SubType)
# LD_JSON auto-extract also requires some content PLUS the ldjson to be present
# 1833 - could be either str or dict, should not be anything else
if json_data.get('@type') and stripped_text_from_html:
try:
if json_data.get('@type') == str or json_data.get('@type') == dict:
types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type')
if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]:
break break
except:
continue
elif stripped_text_from_html: elif stripped_text_from_html:
break break
@ -283,9 +294,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
# Does LD+JSON exist with a @type=='product' and a .price set anywhere? # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content): def has_ldjson_product_info(content):
pricing_data = ''
try: try:
pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product") if not 'application/ld+json' in content:
except JSONNotFound as e: return False
for filter in LD_JSON_PRODUCT_OFFER_SELECTORS:
pricing_data += extract_json_as_string(content=content,
json_filter=filter,
ensure_is_ldjson_info_type="product")
except Exception as e:
# Totally fine # Totally fine
return False return False
x=bool(pricing_data) x=bool(pricing_data)

@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes' name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible' description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:']
class FilterNotFoundInResponse(ValueError): class FilterNotFoundInResponse(ValueError):
def __init__(self, msg): def __init__(self, msg):
@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor):
# Inject a virtual LD+JSON price tracker rule # Inject a virtual LD+JSON price tracker rule
if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT: if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR) include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip()) has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip()) has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor):
pass pass
if has_filter_rule: if has_filter_rule:
json_filter_prefixes = ['json:', 'jq:']
for filter in include_filters_rule: for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes): if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)

@ -2,7 +2,8 @@
import time import time
from flask import url_for from flask import url_for
from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI, wait_for_all_checks
def set_response_with_ldjson(): def set_response_with_ldjson():
test_return_data = """<html> test_return_data = """<html>
@ -27,7 +28,7 @@ def set_response_with_ldjson():
"description":"You dont need it", "description":"You dont need it",
"mpn":"111111", "mpn":"111111",
"sku":"22222", "sku":"22222",
"offers":{ "Offers":{
"@type":"AggregateOffer", "@type":"AggregateOffer",
"lowPrice":8097000, "lowPrice":8097000,
"highPrice":8099900, "highPrice":8099900,
@ -75,12 +76,11 @@ def set_response_without_ldjson():
f.write(test_return_data) f.write(test_return_data)
return None return None
# actually only really used by the distll.io importer, but could be handy too def test_setup(client, live_server):
def test_check_ldjson_price_autodetect(client, live_server):
live_server_setup(live_server) live_server_setup(live_server)
# Give the endpoint time to spin up # actually only really used by the distll.io importer, but could be handy too
time.sleep(1) def test_check_ldjson_price_autodetect(client, live_server):
set_response_with_ldjson() set_response_with_ldjson()
@ -92,7 +92,7 @@ def test_check_ldjson_price_autodetect(client, live_server):
follow_redirects=True follow_redirects=True
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(3) wait_for_all_checks(client)
# Should get a notice that it's available # Should get a notice that it's available
res = client.get(url_for("index")) res = client.get(url_for("index"))
@ -102,11 +102,11 @@ def test_check_ldjson_price_autodetect(client, live_server):
uuid = extract_UUID_from_client(client) uuid = extract_UUID_from_client(client)
client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True)) client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True))
time.sleep(2) wait_for_all_checks(client)
# Trigger a check # Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
time.sleep(2) wait_for_all_checks(client)
# Offer should be gone # Offer should be gone
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'Embedded price data' not in res.data assert b'Embedded price data' not in res.data
@ -138,9 +138,97 @@ def test_check_ldjson_price_autodetect(client, live_server):
follow_redirects=True follow_redirects=True
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(3) wait_for_all_checks(client)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'ldjson-price-track-offer' not in res.data assert b'ldjson-price-track-offer' not in res.data
########################################################################################## ##########################################################################################
client.get(url_for("form_delete", uuid="all"), follow_redirects=True) client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def _test_runner_check_bad_format_ignored(live_server, client, has_ldjson_price_data):
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
for k,v in client.application.config.get('DATASTORE').data['watching'].items():
assert v.get('last_error') == False
assert v.get('has_ldjson_price_data') == has_ldjson_price_data
##########################################################################################
client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def test_bad_ldjson_is_correctly_ignored(client, live_server):
#live_server_setup(live_server)
test_return_data = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": ["Product", "SubType"],
"name": "My test product",
"description": "",
"offers": {
"note" : "You can see the case-insensitive OffERS key, it should work",
"@type": "Offer",
"offeredBy": {
"@type": "Organization",
"name":"Person",
"telephone":"+1 999 999 999"
},
"price": "1",
"priceCurrency": "EUR",
"url": "/some/url"
}
}
</script>
</head>
<body>
<div class="yes">Some extra stuff</div>
</body></html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
_test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=True)
test_return_data = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": ["Product", "SubType"],
"name": "My test product",
"description": "",
"BrokenOffers": {
"@type": "Offer",
"offeredBy": {
"@type": "Organization",
"name":"Person",
"telephone":"+1 999 999 999"
},
"price": "1",
"priceCurrency": "EUR",
"url": "/some/url"
}
}
</script>
</head>
<body>
<div class="yes">Some extra stuff</div>
</body></html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
_test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False)

Loading…
Cancel
Save