changedetection.io/changedetectionio/processors/restock_diff.py


import hashlib
import os
import re
import urllib3
from . import difference_detection_processor
from changedetectionio import content_fetcher
from copy import deepcopy

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

name = 'Re-stock detection for single product pages'
description = 'Detects if the product goes back to in-stock'

class UnableToExtractRestockData(Exception):
    def __init__(self, status_code):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        return

class perform_site_check(difference_detection_processor):
    screenshot = None
    xpath_data = None

    def __init__(self, *args, datastore, **kwargs):
        super().__init__(*args, **kwargs)
        self.datastore = datastore

    def run(self, uuid, skip_when_checksum_same=True):

        # DeepCopy so we can be sure we don't accidently change anything by reference
        watch = deepcopy(self.datastore.data['watching'].get(uuid))

        if not watch:
            raise Exception("Watch no longer exists.")

        # Protect against file:// access
        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
            raise Exception(
                "file:// type access is denied for security reasons."
            )

        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

        request_headers = watch.get('headers', [])
        request_headers.update(self.datastore.get_all_base_headers())
        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid))

        # https://github.com/psf/requests/issues/4525
        # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
        # do this by accident.
        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')

        timeout = self.datastore.data['settings']['requests'].get('timeout')

        url = watch.link

        request_body = self.datastore.data['watching'][uuid].get('body')
        request_method = self.datastore.data['watching'][uuid].get('method')
        ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)

        # Pluggable content fetcher
        prefer_backend = watch.get_fetch_backend
        if not prefer_backend or prefer_backend == 'system':
            prefer_backend = self.datastore.data['settings']['application']['fetch_backend']

        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
            # If the klass doesnt exist, just use a default
            klass = getattr(content_fetcher, "html_requests")

        proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
        proxy_url = None
        if proxy_id:
            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
            print("UUID {} Using proxy {}".format(uuid, proxy_url))

        fetcher = klass(proxy_override=proxy_url)

        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
        if watch['webdriver_delay'] is not None:
            fetcher.render_extract_delay = watch.get('webdriver_delay')
        elif system_webdriver_delay is not None:
            fetcher.render_extract_delay = system_webdriver_delay

        # Could be removed if requests/plaintext could also return some info?
        if prefer_backend != 'html_webdriver':
            raise Exception("Re-stock detection requires Chrome or compatible webdriver/playwright fetcher to work")

        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')

        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
        fetcher.quit()

        self.screenshot = fetcher.screenshot
        self.xpath_data = fetcher.xpath_data

        # Track the content type
        update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
        update_obj["last_check_status"] = fetcher.get_last_status_code()

        # Main detection method
        fetched_md5 = None
        if fetcher.instock_data:
            fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest()
            # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
            update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False
        else:
            raise UnableToExtractRestockData(status_code=fetcher.status_code)

        # The main thing that all this at the moment comes down to :)
        changed_detected = False

        if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5:
            # Yes if we only care about it going to instock, AND we are in stock
            if watch.get('in_stock_only') and update_obj["in_stock"]:
                changed_detected = True

            if not watch.get('in_stock_only'):
                # All cases
                changed_detected = True

        # Always record the new checksum
        update_obj["previous_md5"] = fetched_md5

        return changed_detected, update_obj, fetcher.instock_data.encode('utf-8')