big refactor

piwheels-rpi-crypto
dgtlmoon 6 months ago
parent 32e074da2b
commit d38bb6167b

@ -12,9 +12,10 @@ import copy
# See docs/README.md for rebuilding the docs/apidoc information
from . import api_schema
from ..model import watch_base
# Build a JSON Schema at least partially based on our Watch model
from changedetectionio.model.Watch import base_config as watch_base_config
watch_base_config = watch_base()
schema = api_schema.build_watch_json_schema(watch_base_config)
schema_create_watch = copy.deepcopy(schema)

@ -279,17 +279,19 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
if isinstance(json_data, dict):
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
# @type could also be a list (Product, SubType)
# @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
# LD_JSON auto-extract also requires some content PLUS the ldjson to be present
# 1833 - could be either str or dict, should not be anything else
if json_data.get('@type') and stripped_text_from_html:
try:
if json_data.get('@type') == str or json_data.get('@type') == dict:
types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type')
if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]:
t = json_data.get('@type')
if t and stripped_text_from_html:
if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
break
# The non-standard part, some have a list
elif isinstance(t, list):
if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
break
except:
continue
elif stripped_text_from_html:
break

@ -1,19 +1,14 @@
from .Watch import base_config
import uuid
class model(dict):
from changedetectionio.model import watch_base
def __init__(self, *arg, **kw):
class model(watch_base):
self.update(base_config)
def __init__(self, *arg, **kw):
self['uuid'] = str(uuid.uuid4())
super(model, self).__init__(*arg, **kw)
if kw.get('default'):
self.update(kw['default'])
del kw['default']
# Goes at the end so we update the default object with the initialiser
super(model, self).__init__(*arg, **kw)

@ -1,6 +1,6 @@
from changedetectionio.strtobool import strtobool
from changedetectionio.safe_jinja import render as jinja_render
from . import watch_base
import os
import re
import time
@ -15,71 +15,6 @@ SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
from changedetectionio.notification import (
default_notification_format_for_watch
)
Restock = {'in_stock': None, 'price': None, 'currency': None}
base_config = {
'body': None,
'browser_steps': [],
'browser_steps_last_error_step': None,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'check_count': 0,
'date_created': None,
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
'fetch_backend': 'system', # plaintext, playwright etc
'fetch_time': 0.0,
'processor': 'text_json_diff', # could be restock_diff or others from .processors
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'filter_text_added': True,
'filter_text_replaced': True,
'filter_text_removed': True,
'has_ldjson_price_data': None,
'track_ldjson_price_data': None,
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'in_stock_only' : True, # Only trigger change on going to instock from out-of-stock
'include_filters': [],
'last_checked': 0,
'last_error': False,
'last_viewed': 0, # history key value of the last viewed via the [diff] link
'method': 'GET',
'notification_alert_count': 0,
# Custom notification content
'notification_body': None,
'notification_format': default_notification_format_for_watch,
'notification_muted': False,
'notification_title': None,
'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'paused': False,
'previous_md5': False,
'previous_md5_before_filters': False, # Used for skipping changedetection entirely
'proxy': None, # Preferred proxy connection
'restock': Restock, # Restock/price storage
'remote_server_reply': None, # From 'server' reply header
'sort_text_alphabetically': False,
'subtractive_selectors': [],
'tag': '', # Old system of text name for a tag, to be removed
'tags': [], # list of UUIDs to App.Tags
'text_should_not_be_present': [], # Text that should not present
# Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'title': None,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': '',
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
}
def is_safe_url(test_url):
# See https://github.com/dgtlmoon/changedetection.io/issues/1358
@ -95,20 +30,17 @@ def is_safe_url(test_url):
return True
class model(dict):
class model(watch_base):
__newest_history_key = None
__history_n = 0
jitter_seconds = 0
def __init__(self, *arg, **kw):
self.update(base_config)
self.__datastore_path = kw['datastore_path']
self['uuid'] = str(uuid.uuid4())
del kw['datastore_path']
super(model, self).__init__(*arg, **kw)
if kw.get('default'):
self.update(kw['default'])
del kw['default']
@ -116,9 +48,6 @@ class model(dict):
# Be sure the cached timestamp is ready
bump = self.history
# Goes at the end so we update the default object with the initialiser
super(model, self).__init__(*arg, **kw)
@property
def viewed(self):
# Don't return viewed when last_viewed is 0 and newest_key is 0
@ -257,6 +186,16 @@ class model(dict):
return has_browser_steps
@property
def has_restock_info(self):
    """Tell whether any price or availability data is stored for this watch.

    Returns:
        bool: True when the 'restock' entry is present/non-empty and holds a
        non-None 'price' or 'availability' value, otherwise False.
    """
    restock = self.get('restock')
    # None or an empty dict means nothing has been stored yet
    if not restock:
        return False
    # Identity comparison with None, so falsy-but-real values (e.g. a price of 0) still count
    return restock.get('price') is not None or restock.get('availability') is not None
# Returns the newest key, but if there's only 1 record, then it's counted as not being new, so return 0.
@property
def newest_history_key(self):

@ -0,0 +1,77 @@
import os
import uuid
from changedetectionio import strtobool
from changedetectionio.notification import default_notification_format_for_watch
class Restock(dict):
    """Dict of restock/price details that always carries 'in_stock', 'price' and 'currency' keys."""

    def __init__(self, *args, **kwargs):
        # Start from the defaults, then overlay whatever the caller supplied
        merged = {'in_stock': None, 'price': None, 'currency': None}
        merged.update(*args, **kwargs)
        super().__init__(merged)
class watch_base(dict):
    """Dict pre-populated with the default settings shared by the watch-like models.

    The defaults are rebuilt on every instantiation, then anything passed to the
    constructor is applied on top of them by the regular dict initialiser.
    """

    def __init__(self, *arg, **kw):
        defaults = {
            'body': None,
            'browser_steps': [],
            'browser_steps_last_error_step': None,
            # On change-detected, compare against all history to see if it is something new
            'check_unique_lines': False,
            'check_count': 0,
            'date_created': None,
            # Bumped every time the CSS/xPath filter cannot be located, reset when all is fine
            'consecutive_filter_failures': 0,
            # Extract text by regex after filters
            'extract_text': [],
            'extract_title_as_title': False,
            # plaintext, playwright etc
            'fetch_backend': 'system',
            'fetch_time': 0.0,
            # Could be restock_diff or others from .processors
            'processor': 'text_json_diff',
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
            'filter_text_added': True,
            'filter_text_replaced': True,
            'filter_text_removed': True,
            'has_ldjson_price_data': None,
            'track_ldjson_price_data': None,
            # Extra headers to send
            'headers': {},
            # List of text to ignore when calculating the comparison checksum
            'ignore_text': [],
            # Only trigger change on going to instock from out-of-stock
            'in_stock_only': True,
            'include_filters': [],
            'last_checked': 0,
            'last_error': False,
            # History key value of the last viewed via the [diff] link
            'last_viewed': 0,
            'method': 'GET',
            'notification_alert_count': 0,
            # Custom notification content
            'notification_body': None,
            'notification_format': default_notification_format_for_watch,
            'notification_muted': False,
            'notification_title': None,
            # Include the latest screenshot if available and supported by the apprise URL
            'notification_screenshot': False,
            # List of URLs to add to the notification Queue (Usually AppRise)
            'notification_urls': [],
            'paused': False,
            'previous_md5': False,
            # Used for skipping changedetection entirely
            'previous_md5_before_filters': False,
            # Preferred proxy connection
            'proxy': None,
            # Restock/price storage
            'restock': {},
            # From 'server' reply header
            'remote_server_reply': None,
            'sort_text_alphabetically': False,
            'subtractive_selectors': [],
            # Old system of text name for a tag, to be removed
            'tag': '',
            # List of UUIDs to App.Tags
            'tags': [],
            # Text that should not be present
            'text_should_not_be_present': [],
            # Re #110: all None by default, which means "use the system default".
            # Requires setting to None on submit if it's the same as the default.
            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
            'title': None,
            # List of text or regex to wait for until a change is detected
            'trigger_text': [],
            'url': '',
            'uuid': str(uuid.uuid4()),
            'webdriver_delay': None,
            # Run before change-detection
            'webdriver_js_execute_code': None,
        }
        self.update(defaults)

        # Applied last so the initialiser's values override the defaults
        super().__init__(*arg, **kw)

@ -2,13 +2,14 @@ from . import difference_detection_processor
from ..html_tools import xpath1_filter as xpath_filter
# xpath1 is a lot faster and is sufficient here
from ..html_tools import extract_json_as_string, has_ldjson_product_info
from ..model import Restock
from copy import deepcopy
from loguru import logger
import hashlib
import json
import re
import urllib3
import extruct
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -22,75 +23,65 @@ class UnableToExtractRestockData(Exception):
self.status_code = status_code
return
def _search_prop_by_value(matches, value):
for properties in matches:
for prop in properties:
if value in prop[0]:
return prop[1] # Yield the desired value and exit the function
# NOTE(review): the author left an open "add casting?" note here - presumably about the
# extracted values, which are stored exactly as extruct reports them; confirm.
def get_itemprop_availability(html_content) -> Restock:
    """
    Kind of funny/cool way to find price/availability in one of many different possibilities.
    Use 'extruct' to find any possible RDFa/microdata/json-ld data, then search its output with jsonpath.

    :param html_content: HTML of the page, handed as-is to extruct.extract()
    :return: Restock() built from whichever of 'price', 'currency' and 'availability' were found
    """
    from jsonpath_ng import parse

    value = {}
    now = time.time()
    data = extruct.extract(html_content)

    if data:
        # First phase, dead simple scanning of anything that looks useful
        logger.debug(f"Using jsonpath to find price/availability/etc")
        price_parse = parse('$..(price|Price)')
        pricecurrency_parse = parse('$..(pricecurrency|currency| priceCurrency )')
        availability_parse = parse('$..(availability|Availability)')

        # In each case only the first match is used
        price_result = price_parse.find(data)
        if price_result:
            value['price'] = price_result[0].value

        pricecurrency_result = pricecurrency_parse.find(data)
        if pricecurrency_result:
            value['currency'] = pricecurrency_result[0].value

        availability_result = availability_parse.find(data)
        if availability_result:
            value['availability'] = availability_result[0].value

        if value.get('availability'):
            # Drop surrounding quotes/spaces and the schema.org URL prefix,
            # e.g. "http://schema.org/InStock" becomes "instock"
            value['availability'] = re.sub(r'(?i)^(https|http)://schema.org/', '',
                                           value['availability'].strip(' "\'').lower())

        # Second, go dig OpenGraph which is something that jsonpath_ng cant do because of the tuples and double-dots (:)
        # Runs whenever any field is still missing. The earlier guard tested
        # `not price or availability`, which skipped this fallback exactly when
        # the price was known but the availability was the missing piece.
        if not all(value.get(field) for field in ('price', 'availability', 'currency')):
            logger.debug(f"Alternatively digging through OpenGraph properties for restock/price info..")
            jsonpath_expr = parse('$..properties')

            for match in jsonpath_expr.find(data):
                # Never overwrite something an earlier phase/match already found
                if not value.get('price'):
                    value['price'] = _search_prop_by_value([match.value], "price:amount")
                if not value.get('availability'):
                    value['availability'] = _search_prop_by_value([match.value], "product:availability")
                if not value.get('currency'):
                    value['currency'] = _search_prop_by_value([match.value], "price:currency")

    logger.trace(f"Processed with Extruct in {time.time()-now:.3f}s")

    return Restock(value)
class perform_site_check(difference_detection_processor):
screenshot = None
@ -106,8 +97,7 @@ class perform_site_check(difference_detection_processor):
raise Exception("Watch no longer exists.")
# Unset any existing notification error
from changedetectionio.model.Watch import Restock
update_obj = {'last_notification_error': False, 'last_error': False, 'restock': Restock}
update_obj = {'last_notification_error': False, 'last_error': False, 'restock': None}
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
@ -121,6 +111,7 @@ class perform_site_check(difference_detection_processor):
# Store for other usage
update_obj['restock'] = itemprop_availability
if itemprop_availability.get('availability'):
# @todo: Configurable?
if any(substring.lower() in itemprop_availability['availability'].lower() for substring in [
'instock',

@ -242,7 +242,7 @@ class ChangeDetectionStore:
# Remove a watch's data but keep the entry (URL etc)
def clear_watch_history(self, uuid):
import pathlib
from .model import Restock
self.__data['watching'][uuid].update({
'browser_steps_last_error_step' : None,
'check_count': 0,
@ -257,6 +257,7 @@ class ChangeDetectionStore:
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None,
'restock': Restock()
})
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
@ -623,7 +624,8 @@ class ChangeDetectionStore:
# Eventually almost everything todo with a watch will apply as a Tag
# So we use the same model as a Watch
with self.lock:
new_tag = Watch.model(datastore_path=self.datastore_path, default={
from .model import Tag
new_tag = Tag.model(datastore_path=self.datastore_path, default={
'title': name.strip(),
'date_created': int(time.time())
})

@ -42,7 +42,7 @@
<li class="tab" id=""><a href="#general">General</a></li>
<li class="tab"><a href="#request">Request</a></li>
{% if watch['processor'] == 'restock_diff' %}
<li class="tab"><a href="#restock">Restock Detection</a></li>
<li class="tab"><a href="#restock">Restock & Price Detection</a></li>
{% endif %}
{% if playwright_enabled %}
<li class="tab"><a id="browsersteps-tab" href="#browser-steps">Browser Steps</a></li>

@ -99,6 +99,7 @@
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}error{% endif %}
{% if watch.paused is defined and watch.paused != False %}paused{% endif %}
{% if is_unviewed %}unviewed{% endif %}
{% if watch.has_restock_info %}has-restock-info {% if watch['restock']['in_stock'] %}in-stock{% else %}not-in-stock{% endif %}{% endif %}
{% if watch.uuid in queued_uuids %}queued{% endif %}">
<td class="inline checkbox-uuid" ><input name="uuids" type="checkbox" value="{{ watch.uuid}} " > <span>{{ loop.index+pagination.skip }}</span></td>
<td class="inline watch-controls">
@ -156,20 +157,24 @@
{% if any_has_restock_price_processor %}
<td class="restock-and-price">
{% if watch['processor'] == 'restock_diff' %}
{% if watch['restock']['in_stock'] != None %}
{% if watch.get('restock') and watch['restock']['in_stock'] != None %}
<span class="restock-label {{'in-stock' if watch['restock']['in_stock'] else 'not-in-stock' }}" title="Detecting restock and price">
<!-- maybe some object watch['processor'][restock_diff] or.. -->
{% if watch['restock']['in_stock'] %} In stock {% else %} Not in stock {% endif %}
</span>
{% else %}
<span class="restock-label">Not yet checked</span>
{% endif %}
{% if watch.get('restock') and watch['restock']['price'] != None %}
{% if watch['restock']['price'] != None %}
<span class="restock-label price {{'in-stock' if watch['restock']['in_stock'] else 'not-in-stock' }}" title="Price">
{{ watch['restock']['price'] }} {{ watch['restock']['currency'] }}
</span>
{% endif %}
{% elif not watch.has_restock_info %}
<span class="restock-label">No information</span>
{% endif %}
{% endif %}
</td>
{% endif %}

@ -5,6 +5,8 @@ from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_clie
instock_props = [
# LD+JSON with non-standard list of 'type' https://github.com/dgtlmoon/changedetection.io/issues/1833
'<script type=\'application/ld+json\'>{"@context": "http://schema.org","@type": ["Product", "SubType"],"name": "My test product","description":"","Offers": { "@type": "Offer", "offeredBy": { "@type": "Organization", "name":"Person", "telephone":"+1 999 999 999" }, "price": "1", "priceCurrency": "EUR", "url": "/some/url", "availability": "http://schema.org/InStock"} }</script>',
# LD JSON
'<script type=\'application/ld+json\'>[{"@context":"http://schema.org","@type":"WebSite","name":"Velkoobchod České Díly.cz","description":"Velkoobchodní a maloobchodní prodej originálních a náhradních dílů pro širokou škálu osobních a užitkových vozidel. Jsme největší obchod s náhradními díly v Čechách. Kamenná prodejna v Praze. Široký výběr značek za nejnižší ceny na trhu. MANN-FILTER, Bosch, LUK, VALEO, KYB, NGK, TRW, Brembo, SACHS, FEBI BILSTENI, ATE, INA, CONTIT.VlastnímeECH, PIERBURG, CASTROL , MOTUL, MOBIL, SHELL ,TOTAL ,elf ,LIQUI MOLY , wynn`s a další. Autodoplňky. Autokosmetika. Vybavení pro dílny. Nabídka olejů všech druhů a značek. Nejlevnější autodlíly.","url":"https://ceskedily.cz/autodily/dodge/challenger-kupe/5.7-280kw/filtr?productId=3038915","potentialAction":{"@type":"SearchAction","target":"https://ceskedily.cz/vyhledavani?search={query}","query-input":{"@type":"PropertyValueSpecification","valueRequired":"http://schema.org/True","valueName":"query"}},"publisher":{"@context":"http://schema.org","@type":"Organization","name":"Velkoobchod České Díly.cz","url":"https://ceskedily.cz/","logo":"https://data.kvikymart.space/ceskedily.cz/images/0m/77k/77026/77026_3195959275.png","sameAs":["https://twitter.com/CeskeD","https://www.instagram.com/ceskedily/?hl=cs"]},"sameAs":["https://twitter.com/CeskeD","https://www.instagram.com/ceskedily/?hl=cs"]},{"@context":"http://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":0,"item":{"@id":"/autodily","name":"Autodíly pro osobní vozy"}},{"@type":"ListItem","position":1,"item":{"@id":"/autodily/dodge","name":"DODGE"}},{"@type":"ListItem","position":2,"item":{"@id":"https://ceskedily.cz/autodily/dodge/challenger-kupe/5.7-280kw","name":"CHALLENGER kupé • 5.7 • 280 kW"}}]},{"@context":"http://schema.org","@type":"Product","name":"Olejový filtr K&N Filters HP-2010","description":"","mpn":"HP-2010","brand":"K&N 
Filters","image":"https://digital-assets.tecalliance.services/images/1600/c8fe1f1428021f4fe17a39297686178b04cba885.jpg","offers":{"@context":"http://schema.org","@type":"Offer","price":294.0,"priceCurrency":"CZK","url":"https://ceskedily.cz/olejovy-filtr-k-n-filters-hp-2010","availability":"http://schema.org/InStock"}}]</script>',
'<script id="product-jsonld" type="application/ld+json">{"@context":"https://schema.org","@type":"Product","brand":{"@type":"Brand","name":"Ubiquiti"},"name":"UniFi Express","sku":"UX","description":"Impressively compact UniFi Cloud Gateway and WiFi 6 access point that runs UniFi Network. Powers an entire network or simply meshes as an access point.","url":"https://store.ui.com/us/en/products/ux","image":{"@type":"ImageObject","url":"https://cdn.ecomm.ui.com/products/4ed25b4c-db92-4b98-bbf3-b0989f007c0e/123417a2-895e-49c7-ba04-b6cd8f6acc03.png","width":"1500","height":"1500"},"offers":{"@type":"Offer","availability":"https://schema.org/InStock","priceSpecification":{"@type":"PriceSpecification","price":149,"priceCurrency":"USD","valueAddedTaxIncluded":false}}}</script>',
@ -52,8 +54,8 @@ def test_restock_itemprop_basic(client, live_server):
)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b' in-stock' in res.data
assert b'has-restock-info in-stock' in res.data
assert b'has-restock-info not-in-stock' not in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
@ -67,7 +69,7 @@ def test_restock_itemprop_basic(client, live_server):
)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'not-in-stock' in res.data
assert b'has-restock-info not-in-stock' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data

@ -85,3 +85,6 @@ pytest-flask ~=1.2
jsonschema==4.17.3
loguru
# For scraping all possible metadata relating to products so we can do better restock detection
extruct
Loading…
Cancel
Save