You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
changedetection.io/changedetectionio/processors/restock_diff.py

150 lines
5.8 KiB

from . import difference_detection_processor
9 months ago
from ..html_tools import xpath1_filter as xpath_filter
# xpath1 is a lot faster and is sufficient here
from ..html_tools import extract_json_as_string, has_ldjson_product_info
from copy import deepcopy
from loguru import logger
import hashlib
1 year ago
import re
import urllib3
# Turn off urllib3's InsecureRequestWarning so it is not emitted by this process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-level label and summary for this processor
# (presumably shown wherever processors are listed - confirm against the caller).
name = 'Re-stock detection for single product pages'
description = 'Detects if the product goes back to in-stock'
1 year ago
class UnableToExtractRestockData(Exception):
    """Raised when no stock information could be obtained for a page.

    Attributes:
        status_code: The status code handed in by the raiser, kept on the
            exception so other parts of the app can inspect it.
    """

    def __init__(self, status_code):
        # Deliberately only records the code; other parts of the app read it back.
        self.status_code = status_code
1 year ago
9 months ago
def _normalise_schema_org_availability(raw_value):
    """Reduce a raw availability value to its bare, lower-cased schema.org term.

    Surrounding whitespace and quote characters are stripped, the text is
    lower-cased and a leading ``http://schema.org/`` or ``https://schema.org/``
    prefix is removed, e.g. ``"https://schema.org/InStock"`` becomes ``instock``.
    """
    return re.sub(r'(?i)^(https|http)://schema.org/', '', raw_value.strip(' "\'').lower())


def get_itemprop_availability(html_content):
    """
    Extract the product availability from structured data embedded in a page.

    `itemprop` is a global attribute
    https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop
    https://schema.org/ItemAvailability

    <div class="product-offer" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
      ...
      <link itemprop="availability" href="https://schema.org/OutOfStock" />

    Three sources are tried in order, stopping at the first that yields a value:
    LD-JSON (`offers.availability`), Microdata (`itemprop="availability"` inside
    a schema.org `Offer`, declared with either the http or https URL) and RDFa
    (`property="schema:availability"`).

    :param html_content: HTML source of the page as a string.
    :return: The lower-cased availability term with any schema.org URL prefix
             removed (for example `instock`), or a falsy value (`None` or an
             empty string) when none of the sources produced one.
    """
    # Try/prefer the structured data first if it exists
    # https://schema.org/ItemAvailability Which strings mean we should consider it in stock?

    # Chewing on random content could throw any kind of exception, best to catch it and move on if possible.
    value = None

    # LD-JSON type
    try:
        if has_ldjson_product_info(html_content):
            value = extract_json_as_string(html_content.lower(), "json:$..offers.availability", ensure_is_ldjson_info_type=True)
            if value:
                value = _normalise_schema_org_availability(value)
    except Exception as e:
        # This should be OK, we will attempt the scraped version instead
        logger.warning(f"Exception getting get_itemprop_availability 'LD-JSON' - {str(e)}")

    # Microdata style
    if not value:
        try:
            # Accept both the http and https forms of the Offer type URL, matching
            # the prefix handling applied to the availability value itself.
            value = xpath_filter(
                "//*[@itemtype='https://schema.org/Offer' or @itemtype='http://schema.org/Offer']"
                "//*[@itemprop='availability']/@href",
                html_content)
            if value:
                value = _normalise_schema_org_availability(value)
        except Exception as e:
            # This should be OK, we will attempt the scraped version instead
            logger.warning(f"Exception getting get_itemprop_availability 'Microdata' - {str(e)}")

    # RDFa style
    if not value:
        try:
            value = xpath_filter("//*[@property='schema:availability']/@content", html_content)
            if value:
                value = _normalise_schema_org_availability(value)
        except Exception as e:
            # This should be OK, we will attempt the scraped version instead
            logger.warning(f"Exception getting get_itemprop_availability 'RDFa' - {str(e)}")

    # @todo this should return dict/tuple of instock + price
    return value
9 months ago
class perform_site_check(difference_detection_processor):
    """Restock processor: decides whether a watched product page changed stock state.

    Relies on `self.datastore` and `self.fetcher`, which are not set in this class
    (presumably provided by `difference_detection_processor` - confirm there).
    """

    # Copied from the fetcher on each run
    screenshot = None
    xpath_data = None

    def run_changedetection(self, uuid, skip_when_checksum_same=True):
        """Work out the in-stock state for a watch and whether it changed.

        Structured data in the fetched content is preferred; when it gives no
        answer, the fetcher's existing `instock_data` (from the JS scraper) is used.

        :param uuid: Key of the watch in `self.datastore.data['watching']`.
        :param skip_when_checksum_same: Not used by this processor.
        :return: Tuple of (changed_detected, update_obj, snapshot) where
                 `changed_detected` is a bool, `update_obj` is a dict of values
                 to store on the watch and `snapshot` is the stock text encoded
                 as UTF-8 bytes.
        :raises Exception: If the watch no longer exists.
        :raises UnableToExtractRestockData: If no stock information was found.
        """
        # DeepCopy so we can be sure we don't accidentally change anything by reference
        watch = deepcopy(self.datastore.data['watching'].get(uuid))
        if not watch:
            raise Exception("Watch no longer exists.")

        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False, 'in_stock': None}

        self.screenshot = self.fetcher.screenshot
        self.xpath_data = self.fetcher.xpath_data

        # Track the content type
        update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
        update_obj["last_check_status"] = self.fetcher.get_last_status_code()

        availability = get_itemprop_availability(html_content=self.fetcher.content)
        if availability:
            self.fetcher.instock_data = availability  # Stored as the text snapshot
            # Compare case-insensitively and look for a known term *inside* the
            # extracted value, so that 'InStoreOnly' is recognised.
            # @todo: Configurable?
            availability_lowered = availability.lower()
            update_obj['in_stock'] = any(
                term in availability_lowered
                for term in (
                    'instock',
                    'instoreonly',
                    'limitedavailability',
                    'onlineonly',
                    'presale',  # Debatable?
                )
            )

        # Fallback to scraping the content for keywords (done in JS)
        if update_obj['in_stock'] is None and self.fetcher.instock_data:
            # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
            update_obj['in_stock'] = self.fetcher.instock_data == 'Possibly in stock'
            logger.debug(f"Watch UUID {uuid} restock check returned '{self.fetcher.instock_data}' from JS scraper.")

        if not self.fetcher.instock_data:
            raise UnableToExtractRestockData(status_code=self.fetcher.status_code)

        # Main detection method
        fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()

        # The main thing that all this at the moment comes down to :)
        changed_detected = False
        logger.debug(f"Watch UUID {uuid} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")

        if watch.get('in_stock') != update_obj.get('in_stock'):
            # With 'in_stock_only' set, only a move to in-stock counts as a change;
            # otherwise any change of stock state counts.
            if not watch.get('in_stock_only') or update_obj['in_stock']:
                changed_detected = True

        # Always record the new checksum
        update_obj["previous_md5"] = fetched_md5
        return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8').strip()