From a16b12973170d0205271a2975529693a601d09df Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 9 Dec 2023 12:56:02 +0100 Subject: [PATCH] Simplify logic --- changedetectionio/processors/restock_diff.py | 37 ++++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py index 67b86970..e7c75311 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff.py @@ -1,4 +1,3 @@ - import hashlib import urllib3 from . import difference_detection_processor @@ -9,24 +8,36 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Re-stock detection for single product pages' description = 'Detects if the product goes back to in-stock' + class UnableToExtractRestockData(Exception): def __init__(self, status_code): # Set this so we can use it in other parts of the app self.status_code = status_code return + class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None def get_itemprop_availability(self): + """ + `itemprop` is a global attribute + https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop + https://schema.org/ItemAvailability + +
+ ... + + + :return: + """ from ..html_tools import xpath_filter import re - # - # https://schema.org/ItemAvailability + value = None try: - value = xpath_filter("//link[@itemprop='availability']/@href", self.fetcher.content) + value = xpath_filter("//*[@itemtype='https://schema.org/Offer']//*[@itemprop='availability']/@href", self.fetcher.content) if value: value = re.sub(r'(?i)^http(s)+://schema.org/', '', value.strip()) @@ -35,7 +46,6 @@ class perform_site_check(difference_detection_processor): return value - def run_changedetection(self, uuid, skip_when_checksum_same=True): # DeepCopy so we can be sure we don't accidently change anything by reference @@ -61,39 +71,36 @@ class perform_site_check(difference_detection_processor): # https://schema.org/ItemAvailability Which strings mean we should consider it in stock? availability = self.get_itemprop_availability() if availability: + self.fetcher.instock_data = availability if any(availability in s for s in [ 'InStock', 'InStoreOnly', 'LimitedAvailability', 'OnlineOnly', - 'PreSale' # Debatable? + 'PreSale' # Debatable? ]): - self.fetcher.instock_data = 'Possibly in stock' + update_obj['in_stock'] = True else: - self.fetcher.instock_data = availability + update_obj['in_stock'] = False # Fallback to scraping the content for keywords (done in JS) if self.fetcher.instock_data: - fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. - update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False + update_obj['in_stock'] = True if self.fetcher.instock_data == 'Possibly in stock' else False else: raise UnableToExtractRestockData(status_code=self.fetcher.status_code) # The main thing that all this at the moment comes down to :) changed_detected = False - if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5: + if watch.get('in_stock') != update_obj.get('in_stock'): # Yes if we only care about it going to instock, AND we are in stock - if watch.get('in_stock_only') and update_obj["in_stock"]: + if watch.get('in_stock_only') and update_obj['in_stock']: changed_detected = True if not watch.get('in_stock_only'): # All cases changed_detected = True - # Always record the new checksum - update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8')