From b58fd995b57a4617efe5a04758da779bbe89e797 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 8 Dec 2022 17:47:22 +0100 Subject: [PATCH] Automatically offer to track LD+JSON product price data (#1204) --- changedetectionio/__init__.py | 4 + .../blueprint/price_data_follower/__init__.py | 27 ++++ changedetectionio/fetch_site_status.py | 11 +- changedetectionio/html_tools.py | 38 ++++- changedetectionio/model/Watch.py | 2 + .../static/images/price-tag-icon.svg | 2 + .../static/styles/scss/styles.scss | 27 ++++ changedetectionio/static/styles/styles.css | 21 +++ changedetectionio/store.py | 15 +- .../templates/watch-overview.html | 10 +- .../test_automatic_follow_ldjson_price.py | 146 ++++++++++++++++++ 11 files changed, 289 insertions(+), 14 deletions(-) create mode 100644 changedetectionio/blueprint/price_data_follower/__init__.py create mode 100644 changedetectionio/static/images/price-tag-icon.svg create mode 100644 changedetectionio/tests/test_automatic_follow_ldjson_price.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 44ed9449..fbe9d09d 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None): import changedetectionio.blueprint.browser_steps as browser_steps app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps') + import changedetectionio.blueprint.price_data_follower as price_data_follower + app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower') + + # @todo handle ctrl break ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() threading.Thread(target=notification_runner).start() diff --git a/changedetectionio/blueprint/price_data_follower/__init__.py b/changedetectionio/blueprint/price_data_follower/__init__.py new file mode 100644 index 00000000..2c420618 --- /dev/null +++ 
b/changedetectionio/blueprint/price_data_follower/__init__.py
@@ -0,0 +1,27 @@
+
+from distutils.util import strtobool
+from flask import Blueprint, flash, redirect, url_for
+from flask_login import login_required
+from changedetectionio.store import ChangeDetectionStore
+
+def construct_blueprint(datastore: ChangeDetectionStore):
+    """Return a Blueprint whose accept/reject routes store the user's answer in datastore.data['watching'][uuid]['track_ldjson_price_data']."""
+    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)
+
+    # route() registers the callable it is handed, so it has to wrap login_required (outermost), not the other way round
+    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
+    @login_required
+    def accept(uuid):
+        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
+        return redirect(url_for("form_watch_checknow", uuid=uuid))
+
+
+    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
+    @login_required
+    def reject(uuid):
+        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
+        return redirect(url_for("index"))
+
+
+    return price_data_follower_blueprint
+
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 68762f45..5397fbb0 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -2,7 +2,6 @@ import hashlib import logging import os import re -import time import urllib3 from changedetectionio import content_fetcher, html_tools @@ -140,7 +139,7 @@ class perform_site_check(): is_html = False is_json = False - include_filters_rule = watch.get('include_filters', []) + include_filters_rule = deepcopy(watch.get('include_filters', [])) # include_filters_rule = watch['include_filters'] subtractive_selectors = watch.get( "subtractive_selectors", [] @@ -148,6 +147,10 @@ class perform_site_check(): "global_subtractive_selectors", [] ) + # Inject a virtual LD+JSON price tracker rule + if watch.get('track_ldjson_price_data'): + include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR) + has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip()) has_subtractive_selectors = subtractive_selectors 
and len(subtractive_selectors[0].strip()) @@ -173,9 +176,13 @@ class perform_site_check(): # Don't run get_text or xpath/css filters on plaintext stripped_text_from_html = html_content else: + # Does it have some ld+json price data? used for easier monitoring + update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content) + # Then we assume HTML if has_filter_rule: html_content = "" + for filter_rule in include_filters_rule: # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 06b14958..206be070 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -10,6 +10,10 @@ import re # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" +# 'price' , 'lowPrice', 'highPrice' are usually under here +# all of those may or may not appear on different websites +LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" + class JSONNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) @@ -127,8 +131,10 @@ def _get_stripped_text_from_json_match(match): return stripped_text_from_html -def extract_json_as_string(content, json_filter): - +# content - json +# json_filter - ie json:$..price +# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector) +def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None): stripped_text_from_html = False # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded blob.. just return the first that matches json_filter s = [] soup = BeautifulSoup(content, 'html.parser') - bs_result = soup.findAll('script') + + if ensure_is_ldjson_info_type: + bs_result = soup.findAll('script', {"type": "application/ld+json"}) + else: + bs_result = soup.findAll('script') + if not bs_result: raise JSONNotFound("No parsable JSON found in this document") @@ -156,7 +167,14 @@ def extract_json_as_string(content, json_filter): continue else: stripped_text_from_html = _parse_json(json_data, json_filter) - if stripped_text_from_html: + if ensure_is_ldjson_info_type: + # Could sometimes be list, string or something else random + if isinstance(json_data, dict): + # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search + # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) + if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: + break + elif stripped_text_from_html: break if not stripped_text_from_html: @@ -243,6 +261,18 @@ def html_to_text(html_content: str, 
render_anchor_tag_content=False) -> str: return text_content
+
+def has_ldjson_product_info(content):
+    """Report whether extract_json_as_string() finds anything for LD_JSON_PRODUCT_OFFER_SELECTOR when restricted to LD+JSON of type 'product'."""
+    try:
+        pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product")
+    except JSONNotFound:
+        # Totally fine - the page simply carries no usable JSON, which is not an error for this probe
+        return False
+    # Collapse whatever was extracted (text, or a falsy "nothing matched") into a plain bool
+    return bool(pricing_data)
+
+
 def workarounds_for_obfuscations(content): """ Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index c3a000b0..d1183752 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -26,6 +26,8 @@ class model(dict): 'extract_title_as_title': False, 'fetch_backend': None, 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), + 'has_ldjson_price_data': None, + 'track_ldjson_price_data': None, 'headers': {}, # Extra headers to send 'ignore_text': [], # List of text to ignore when calculating the comparison checksum 'include_filters': [], diff --git a/changedetectionio/static/images/price-tag-icon.svg b/changedetectionio/static/images/price-tag-icon.svg new file mode 100644 index 00000000..f58b1c17 --- /dev/null +++ b/changedetectionio/static/images/price-tag-icon.svg @@ -0,0 +1,2 @@ + + diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index 3bfa5a12..69310fa4 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -1009,3 +1009,30 @@ ul { border-radius: 5px; color: var(--color-warning); } + +/* automatic price following helpers */ +.tracking-ldjson-price-data { + background-color: var(--color-background-button-green); + color: #000; + padding: 3px; + border-radius: 3px; + white-space: nowrap; +} + 
+.ldjson-price-track-offer { + a.pure-button { + border-radius: 3px; + padding: 3px; + background-color: var(--color-background-button-green); + } + + font-weight: bold; + font-style: italic; +} + +.price-follow-tag-icon { + display: inline-block; + height: 0.8rem; + vertical-align: middle; +} + diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index 1fae1681..2d5511b4 100644 --- a/changedetectionio/static/styles/styles.css +++ b/changedetectionio/static/styles/styles.css @@ -945,3 +945,24 @@ ul { display: inline; height: 26px; vertical-align: middle; } + +/* automatic price following helpers */ +.tracking-ldjson-price-data { + background-color: var(--color-background-button-green); + color: #000; + padding: 3px; + border-radius: 3px; + white-space: nowrap; } + +.ldjson-price-track-offer { + font-weight: bold; + font-style: italic; } + .ldjson-price-track-offer a.pure-button { + border-radius: 3px; + padding: 3px; + background-color: var(--color-background-button-green); } + +.price-follow-tag-icon { + display: inline-block; + height: 0.8rem; + vertical-align: middle; } diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 69623980..49f0bd61 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -250,12 +250,15 @@ class ChangeDetectionStore: def clear_watch_history(self, uuid): import pathlib - self.__data['watching'][uuid].update( - {'last_checked': 0, - 'last_viewed': 0, - 'previous_md5': False, - 'last_notification_error': False, - 'last_error': False}) + self.__data['watching'][uuid].update({ + 'last_checked': 0, + 'has_ldjson_price_data': None, + 'last_error': False, + 'last_notification_error': False, + 'last_viewed': 0, + 'previous_md5': False, + 'track_ldjson_price_data': None, + }) # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"): diff --git 
a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 96de0b77..35c5cc95 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -88,9 +88,9 @@ {{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}} - + - {%if watch.fetch_backend == "html_webdriver" %}{% endif %} + {%if watch.fetch_backend == "html_webdriver" %}{% endif %} {% if watch.last_error is defined and watch.last_error != False %}
{{ watch.last_error }}
@@ -98,6 +98,12 @@ {% if watch.last_notification_error is defined and watch.last_notification_error != False %} {% endif %} + {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %} +
Embedded price data detected, follow only price data? Yes No
+ {% endif %} + {% if watch['track_ldjson_price_data'] == 'accepted' %} + Price + {% endif %} {% if not active_tag %} {{ watch.tag}} {% endif %} diff --git a/changedetectionio/tests/test_automatic_follow_ldjson_price.py b/changedetectionio/tests/test_automatic_follow_ldjson_price.py new file mode 100644 index 00000000..6f0c8ced --- /dev/null +++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py @@ -0,0 +1,146 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI + +def set_response_with_ldjson(): + test_return_data = """ + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+
Some text thats the same
+
Some text that will change
+ + + +""" + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + +def set_response_without_ldjson(): + test_return_data = """ + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+
Some text thats the same
+
Some text that will change
+ + +""" + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None
+
+# The "track price data" offer should appear only for pages that embed LD+JSON product data, and accepting it should reduce the snapshot to that data
+def test_check_ldjson_price_autodetect(client, live_server):
+    live_server_setup(live_server)
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_response_with_ldjson()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+
+    # Should get a notice that it's available
+    res = client.get(url_for("index"))
+    assert b'ldjson-price-track-offer' in res.data
+
+    # Accept it (follow_redirects belongs to the test client call, not to url_for, where it would only become a query parameter)
+    uuid = extract_UUID_from_client(client)
+
+    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
+    time.sleep(2)
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+    # Offer should be gone
+    res = client.get(url_for("index"))
+    assert b'Embedded price data' not in res.data
+    assert b'tracking-ldjson-price-data' in res.data
+
+    # and last snapshot (via API) should be just the price
+    api_key = extract_api_key_from_UI(client)
+    res = client.get(
+        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
+        headers={'x-api-key': api_key},
+    )
+
+    # Should see this (dont know where the whitespace came from)
+    assert b'"highPrice": 8099900' in res.data
+    # And not this, because it is not part of the LD+JSON
+    assert b"So let's see what happens" not in res.data
+
+    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+
+    ##########################################################################################
+    # And we shouldn't see the offer
+    set_response_without_ldjson()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+    res = client.get(url_for("index"))
+    assert b'ldjson-price-track-offer' not in res.data
+
+    ##########################################################################################
+    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
\ No newline at end of file