Try to handle two different cases of Offers detection

1833-ldjson-fix
dgtlmoon 1 year ago
parent 9e9f9d30c8
commit e86e178203

@ -7,13 +7,14 @@ from typing import List
import json import json
import re import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here # 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites # All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
class JSONNotFound(ValueError): class JSONNotFound(ValueError):
def __init__(self, msg): def __init__(self, msg):
@ -293,14 +294,17 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
# Does LD+JSON exist with a @type=='product' and a .price set anywhere? # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content): def has_ldjson_product_info(content):
pricing_data = ''
try: try:
if not 'application/ld+json' in content: if not 'application/ld+json' in content:
return False return False
# Always lowercase the content so the json_filter for finding $..offers matches for filter in LD_JSON_PRODUCT_OFFER_SELECTORS:
pricing_data = extract_json_as_string(content=content.lower(), pricing_data += extract_json_as_string(content=content,
json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, json_filter=filter,
ensure_is_ldjson_info_type="product") ensure_is_ldjson_info_type="product")
except Exception as e: except Exception as e:
# Totally fine # Totally fine
return False return False

@ -17,7 +17,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes' name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible' description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:']
class FilterNotFoundInResponse(ValueError): class FilterNotFoundInResponse(ValueError):
def __init__(self, msg): def __init__(self, msg):
@ -196,7 +196,7 @@ class perform_site_check(difference_detection_processor):
# Inject a virtual LD+JSON price tracker rule # Inject a virtual LD+JSON price tracker rule
if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT: if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR) include_filters_rule += html_tools.LD_JSON_PRODUCT_OFFER_SELECTORS
has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip()) has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip()) has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
@ -214,7 +214,6 @@ class perform_site_check(difference_detection_processor):
pass pass
if has_filter_rule: if has_filter_rule:
json_filter_prefixes = ['json:', 'jq:']
for filter in include_filters_rule: for filter in include_filters_rule:
if any(prefix in filter for prefix in json_filter_prefixes): if any(prefix in filter for prefix in json_filter_prefixes):
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)

@ -28,7 +28,7 @@ def set_response_with_ldjson():
"description":"You dont need it", "description":"You dont need it",
"mpn":"111111", "mpn":"111111",
"sku":"22222", "sku":"22222",
"oFFerS":{ "Offers":{
"@type":"AggregateOffer", "@type":"AggregateOffer",
"lowPrice":8097000, "lowPrice":8097000,
"highPrice":8099900, "highPrice":8099900,
@ -177,7 +177,7 @@ def test_bad_ldjson_is_correctly_ignored(client, live_server):
"@type": ["Product", "SubType"], "@type": ["Product", "SubType"],
"name": "My test product", "name": "My test product",
"description": "", "description": "",
"OffeRS": { "offers": {
"note" : "You can see the case-insensitive OffERS key, it should work", "note" : "You can see the case-insensitive OffERS key, it should work",
"@type": "Offer", "@type": "Offer",
"offeredBy": { "offeredBy": {

Loading…
Cancel
Save