Automatically offer to track LD+JSON product price data (#1204)

2 years ago · b58fd995b5
parent f7bb8a0afa
commit b58fd995b5
11 changed files with 289 additions and 14 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
    import changedetectionio.blueprint.browser_steps as browser_steps
    app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
    import changedetectionio.blueprint.price_data_follower as price_data_follower
    app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
    # @todo handle ctrl break
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
    threading.Thread(target=notification_runner).start()
--- a/changedetectionio/blueprint/price_data_follower/init.py
+++ b/changedetectionio/blueprint/price_data_follower/init.py
@ -0,0 +1,27 @@
 from distutils.util import strtobool
 from flask import Blueprint, flash, redirect, url_for
 from flask_login import login_required
 from changedetectionio.store import ChangeDetectionStore
 def construct_blueprint(datastore: ChangeDetectionStore):
    """Build the blueprint that stores the user's answer to the LD+JSON price-tracking offer.

    Two GET endpoints are exposed, both keyed by the watch UUID:

    * ``/<uuid>/accept`` - sets ``track_ldjson_price_data`` to ``'accepted'``
      and redirects to ``form_watch_checknow`` for that watch.
    * ``/<uuid>/reject`` - sets ``track_ldjson_price_data`` to ``'rejected'``
      and redirects to ``index``.

    :param datastore: store whose ``data['watching'][uuid]`` entry is updated.
    :return: the configured ``price_data_follower`` blueprint.
    """
    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    # route() must be the OUTERMOST decorator: it registers whatever callable it
    # receives. With login_required on the outside, the unprotected function is
    # what gets registered and the login check never runs for these URLs.
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    @login_required
    def accept(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
        # Re-check straight away so the next snapshot contains only the price data
        return redirect(url_for("form_watch_checknow", uuid=uuid))

    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    @login_required
    def reject(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
        return redirect(url_for("index"))

    return price_data_follower_blueprint
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -2,7 +2,6 @@ import hashlib
 import logging
 import os
 import re
 import time
 import urllib3
 from changedetectionio import content_fetcher, html_tools
@ -140,7 +139,7 @@ class perform_site_check():
            is_html = False
            is_json = False
-        include_filters_rule = watch.get('include_filters', [])
+        include_filters_rule = deepcopy(watch.get('include_filters', []))
        # include_filters_rule = watch['include_filters']
        subtractive_selectors = watch.get(
            "subtractive_selectors", []
@ -148,6 +147,10 @@ class perform_site_check():
            "global_subtractive_selectors", []
        )
        # Inject a virtual LD+JSON price tracker rule
        if watch.get('track_ldjson_price_data'):
            include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
        has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
        has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
@ -173,9 +176,13 @@ class perform_site_check():
                # Don't run get_text or xpath/css filters on plaintext
                stripped_text_from_html = html_content
            else:
                # Does it have some ld+json price data? used for easier monitoring
                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
                # Then we assume HTML
                if has_filter_rule:
                    html_content = ""
                    for filter_rule in include_filters_rule:
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -10,6 +10,10 @@ import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"
 # 'price' , 'lowPrice', 'highPrice' are usually under here
 # all of those may or may not appear on different websites
 LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
 class JSONNotFound(ValueError):
    """Raised when no parsable JSON can be located in a document."""

    def __init__(self, msg):
        super().__init__(msg)
@ -127,8 +131,10 @@ def _get_stripped_text_from_json_match(match):
    return stripped_text_from_html
-def extract_json_as_string(content, json_filter):
+# content - json
-
+# json_filter - ie json:$..price
 # ensure_is_ldjson_info_type - optional str, e.g. "product"; only accept LD+JSON whose "@type" equals it (I don't know how to express that as a JSON selector)
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    stripped_text_from_html = False
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
@ -139,7 +145,12 @@ def extract_json_as_string(content, json_filter):
        # Foreach <script json></script> blob.. just return the first that matches json_filter
        s = []
        soup = BeautifulSoup(content, 'html.parser')
-        bs_result = soup.findAll('script')
+
        if ensure_is_ldjson_info_type:
            bs_result = soup.findAll('script', {"type": "application/ld+json"})
        else:
            bs_result = soup.findAll('script')
        if not bs_result:
            raise JSONNotFound("No parsable JSON found in this document")
@ -156,7 +167,14 @@ def extract_json_as_string(content, json_filter):
                continue
            else:
                stripped_text_from_html = _parse_json(json_data, json_filter)
-                if stripped_text_from_html:
+                if ensure_is_ldjson_info_type:
                    # Could sometimes be list, string or something else random
                    if isinstance(json_data, dict):
                        # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
                        # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
                            break
                elif stripped_text_from_html:
                    break
    if not stripped_text_from_html:
@ -243,6 +261,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    return text_content
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
    """Report whether ``content`` embeds LD+JSON product offer data.

    The content is searched with ``LD_JSON_PRODUCT_OFFER_SELECTOR``, restricted
    to LD+JSON of type "product".

    :param content: the fetched document to inspect.
    :return: ``True`` when the selector matched something, ``False`` when it
        matched nothing or no JSON could be found at all.
    """
    try:
        offers_text = extract_json_as_string(
            content=content,
            json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR,
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine - most pages simply carry no price data
        return False
    return bool(offers_text)
 def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -26,6 +26,8 @@ class model(dict):
            'extract_title_as_title': False,
            'fetch_backend': None,
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
            'has_ldjson_price_data': None,
            'track_ldjson_price_data': None,
            'headers': {},  # Extra headers to send
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
            'include_filters': [],
--- a/changedetectionio/static/images/price-tag-icon.svg
+++ b/changedetectionio/static/images/price-tag-icon.svg
@ -0,0 +1,2 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <svg width="83.39" height="89.648" enable-background="new 0 0 122.406 122.881" version="1.1" viewBox="0 0 83.39 89.648" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(5e-4 -33.234)"><path d="m44.239 42.946-39.111 39.896 34.908 34.91 39.09-39.876-1.149-34.931zm-0.91791 42.273c0.979-0.979 1.507-1.99 1.577-3.027 0.077-1.043-0.248-2.424-0.967-4.135-0.725-1.717-1.348-3.346-1.87-4.885s-0.814-3.014-0.897-4.432c-0.07-1.42 0.134-2.768 0.624-4.045 0.477-1.279 1.348-2.545 2.607-3.804 2.099-2.099 4.535-3.123 7.314-3.065 2.773 0.063 5.457 1.158 8.04 3.294l2.881 3.034c1.946 2.607 2.799 5.33 2.557 8.166-0.235 2.83-1.532 5.426-3.893 7.785l-6.296-6.297c1.291-1.291 2.035-2.531 2.238-3.727 0.191-1.197-0.165-2.252-1.081-3.168-0.821-0.82-1.717-1.195-2.69-1.139-0.967 0.064-1.908 0.547-2.817 1.457-0.922 0.922-1.393 1.914-1.412 2.977s0.306 2.416 0.973 4.064c0.661 1.652 1.24 3.25 1.736 4.801 0.496 1.553 0.782 3.035 0.858 4.445 0.076 1.426-0.127 2.787-0.591 4.104-0.477 1.316-1.336 2.596-2.588 3.848-2.125 2.125-4.522 3.186-7.212 3.18s-5.311-1.063-7.855-3.16l-3.747 3.746-2.964-2.965 3.766-3.764c-2.423-2.996-3.568-5.998-3.447-9.02 0.127-3.014 1.476-5.813 4.045-8.383l6.278 6.277c-1.412 1.412-2.175 2.799-2.277 4.16-0.108 1.367 0.414 2.627 1.571 3.783 0.839 0.84 1.755 1.26 2.741 1.242 0.985-0.017 1.92-0.47 2.798-1.347zm21.127-46.435h17.457c-0.0269 2.2368 0.69936 16.025 0.69936 16.025l0.785 23.858c0.019 0.609-0.221 1.164-0.619 1.564l5e-3 4e-3 -41.236 42.022c-0.82213 0.8378-2.175 0.83-3.004 0l-37.913-37.91c-0.83-0.83-0.83-2.176 0-3.006l41.236-42.021c0.39287-0.42671 1.502-0.53568 1.502-0.53568zm18.011 11.59c-59.392-29.687-29.696-14.843 0 0z"/></g></svg>
--- a/changedetectionio/static/styles/scss/styles.scss
+++ b/changedetectionio/static/styles/scss/styles.scss
@ -1009,3 +1009,30 @@ ul {
  border-radius: 5px;
  color: var(--color-warning);
 }
 /* automatic price following helpers */
 // "Price" badge shown in the watch overview when embedded price tracking has been accepted.
 .tracking-ldjson-price-data {
  background-color: var(--color-background-button-green);
  color: #000;
  padding: 3px;
  border-radius: 3px;
  // Keep the icon and the label on one line
  white-space: nowrap;
 }
 // Prompt in the watch overview offering to follow only the embedded price data.
 .ldjson-price-track-offer {
  // Own declarations go BEFORE nested rules: Sass deprecates declarations that
  // follow a nested rule, and this is also the order of the compiled CSS.
  font-weight: bold;
  font-style: italic;
  // The "Yes" button of the prompt
  a.pure-button {
    border-radius: 3px;
    padding: 3px;
    background-color: var(--color-background-button-green);
  }
 }
 // Price-tag icon rendered inside the "Price" badge.
 .price-follow-tag-icon {
  display: inline-block;
  height: 0.8rem;
  vertical-align: middle;
 }
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@ -945,3 +945,24 @@ ul {
    display: inline;
    height: 26px;
    vertical-align: middle; }
 /* automatic price following helpers */
 .tracking-ldjson-price-data {
  background-color: var(--color-background-button-green);
  color: #000;
  padding: 3px;
  border-radius: 3px;
  white-space: nowrap; }
 .ldjson-price-track-offer {
  font-weight: bold;
  font-style: italic; }
  .ldjson-price-track-offer a.pure-button {
    border-radius: 3px;
    padding: 3px;
    background-color: var(--color-background-button-green); }
 .price-follow-tag-icon {
  display: inline-block;
  height: 0.8rem;
  vertical-align: middle; }
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -250,12 +250,15 @@ class ChangeDetectionStore:
    def clear_watch_history(self, uuid):
        import pathlib
-        self.__data['watching'][uuid].update(
+        self.__data['watching'][uuid].update({
-            {'last_checked': 0,
+                'last_checked': 0,
-             'last_viewed': 0,
+                'has_ldjson_price_data': None,
-             'previous_md5': False,
+                'last_error': False,
-             'last_notification_error': False,
+                'last_notification_error': False,
-             'last_error': False})
+                'last_viewed': 0,
                'previous_md5': False,
                'track_ldjson_price_data': None,
            })
        # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
        for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@ -88,9 +88,9 @@
                </td>
                <td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
                    <a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
-                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" /></a>
+                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" title="Create a link to share watch config with others" /></a>
-                    {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}
+                    {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
                    {% if watch.last_error is defined and watch.last_error != False %}
                    <div class="fetch-error">{{ watch.last_error }}</div>
@ -98,6 +98,12 @@
                    {% if watch.last_notification_error is defined and watch.last_notification_error != False %}
                    <div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
                    {% endif %}
                    {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data']  %}
                    <div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
                    {% endif %}
                    {% if watch['track_ldjson_price_data'] == 'accepted' %}
                    <span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}"  class="price-follow-tag-icon"/> Price</span>
                    {% endif %}
                    {% if not active_tag %}
                    <span class="watch-tag-list">{{ watch.tag}}</span>
                    {% endif %}
--- a/changedetectionio/tests/test_automatic_follow_ldjson_price.py
+++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py
@ -0,0 +1,146 @@
 #!/usr/bin/python3
 import time
 from flask import url_for
 from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
 def set_response_with_ldjson():
    """Write a test page that embeds an LD+JSON "Product" block with offers.

    The page is stored in ``test-datastore/endpoint-content.txt``.
    """
    ldjson_page = """<html>
       <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
     </br>
     So let's see what happens.  </br>
     <div class="sametext">Some text thats the same</div>
     <div class="changetext">Some text that will change</div>
     <script type="application/ld+json">
        {
           "@context":"https://schema.org/",
           "@type":"Product",
           "@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
           "name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
           "brand":{
              "@type":"Brand",
              "name":"APPLE"
           },
           "image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
           "description":"You dont need it",
           "mpn":"111111",
           "sku":"22222",
           "offers":{
              "@type":"AggregateOffer",
              "lowPrice":8097000,
              "highPrice":8099900,
              "priceCurrency":"COP",
              "offers":[
                 {
                    "@type":"Offer",
                    "price":8097000,
                    "priceCurrency":"COP",
                    "availability":"http://schema.org/InStock",
                    "sku":"102375961",
                    "itemCondition":"http://schema.org/NewCondition",
                    "seller":{
                       "@type":"Organization",
                       "name":"ajax"
                    }
                 }
              ],
              "offerCount":1
           }
        }
       </script>
     </body>
     </html>
 """
    # Overwrite whatever page a previous step left behind
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(ldjson_page)
 def set_response_without_ldjson():
    """Write a test page that carries no LD+JSON block at all.

    The page is stored in ``test-datastore/endpoint-content.txt``.
    """
    plain_page = """<html>
       <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
     </br>
     So let's see what happens.  </br>
     <div class="sametext">Some text thats the same</div>
     <div class="changetext">Some text that will change</div>     
     </body>
     </html>
 """
    # Overwrite whatever page a previous step left behind
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(plain_page)
 # actually only really used by the distill.io importer, but could be handy too
 def test_check_ldjson_price_autodetect(client, live_server):
    """End-to-end check of the automatic LD+JSON price-tracking offer.

    With LD+JSON product data on the page, the overview must show the offer;
    after accepting it the offer disappears, the tracking badge appears and the
    latest snapshot holds only the price data. Without LD+JSON on the page, no
    offer must be shown.
    """
    live_server_setup(live_server)
    # Give the endpoint time to spin up
    time.sleep(1)
    set_response_with_ldjson()
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)
    # Should get a notice that it's available
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' in res.data
    # Accept it
    uuid = extract_UUID_from_client(client)
    # follow_redirects belongs to client.get(); handed to url_for() it would only
    # be appended to the URL as a query parameter and the redirect is not followed
    client.get(url_for('price_data_follower.accept', uuid=uuid), follow_redirects=True)
    time.sleep(2)
    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)
    # Offer should be gone
    res = client.get(url_for("index"))
    assert b'Embedded price data' not in res.data
    assert b'tracking-ldjson-price-data' in res.data
    # and last snapshot (via API) should be just the price
    api_key = extract_api_key_from_UI(client)
    res = client.get(
        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
        headers={'x-api-key': api_key},
    )
    # Should see this (don't know where the whitespace came from)
    assert b'"highPrice": 8099900' in res.data
    # And not this, because it's not part of the ld+json
    assert b"So let's see what happens" not in res.data
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    ##########################################################################################
    # And we shouldn't see the offer
    set_response_without_ldjson()
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    time.sleep(3)
    res = client.get(url_for("index"))
    assert b'ldjson-price-track-offer' not in res.data
    ##########################################################################################
    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
		`@ -0,0 +1,2 @@`
							`<?xml version="1.0" encoding="UTF-8"?>`
							<svg width="83.39" height="89.648" enable-background="new 0 0 122.406 122.881" version="1.1" viewBox="0 0 83.39 89.648" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(5e-4 -33.234)"><path d="m44.239 42.946-39.111 39.896 34.908 34.91 39.09-39.876-1.149-34.931zm-0.91791 42.273c0.979-0.979 1.507-1.99 1.577-3.027 0.077-1.043-0.248-2.424-0.967-4.135-0.725-1.717-1.348-3.346-1.87-4.885s-0.814-3.014-0.897-4.432c-0.07-1.42 0.134-2.768 0.624-4.045 0.477-1.279 1.348-2.545 2.607-3.804 2.099-2.099 4.535-3.123 7.314-3.065 2.773 0.063 5.457 1.158 8.04 3.294l2.881 3.034c1.946 2.607 2.799 5.33 2.557 8.166-0.235 2.83-1.532 5.426-3.893 7.785l-6.296-6.297c1.291-1.291 2.035-2.531 2.238-3.727 0.191-1.197-0.165-2.252-1.081-3.168-0.821-0.82-1.717-1.195-2.69-1.139-0.967 0.064-1.908 0.547-2.817 1.457-0.922 0.922-1.393 1.914-1.412 2.977s0.306 2.416 0.973 4.064c0.661 1.652 1.24 3.25 1.736 4.801 0.496 1.553 0.782 3.035 0.858 4.445 0.076 1.426-0.127 2.787-0.591 4.104-0.477 1.316-1.336 2.596-2.588 3.848-2.125 2.125-4.522 3.186-7.212 3.18s-5.311-1.063-7.855-3.16l-3.747 3.746-2.964-2.965 3.766-3.764c-2.423-2.996-3.568-5.998-3.447-9.02 0.127-3.014 1.476-5.813 4.045-8.383l6.278 6.277c-1.412 1.412-2.175 2.799-2.277 4.16-0.108 1.367 0.414 2.627 1.571 3.783 0.839 0.84 1.755 1.26 2.741 1.242 0.985-0.017 1.92-0.47 2.798-1.347zm21.127-46.435h17.457c-0.0269 2.2368 0.69936 16.025 0.69936 16.025l0.785 23.858c0.019 0.609-0.221 1.164-0.619 1.564l5e-3 4e-3 -41.236 42.022c-0.82213 0.8378-2.175 0.83-3.004 0l-37.913-37.91c-0.83-0.83-0.83-2.176 0-3.006l41.236-42.021c0.39287-0.42671 1.502-0.53568 1.502-0.53568zm18.011 11.59c-59.392-29.687-29.696-14.843 0 0z"/></g></svg>