diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 5ec6f891..ca262be7 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -2,7 +2,7 @@ # Read more https://github.com/dgtlmoon/changedetection.io/wiki -__version__ = '0.46.02' +__version__ = '0.46.04' from changedetectionio.strtobool import strtobool from json.decoder import JSONDecodeError diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index f92bf9f8..a472ba4b 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore): browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui( playwright_browser=browsersteps_start_session['browser'], proxy=proxy, - start_url=datastore.data['watching'][watch_uuid].get('url') + start_url=datastore.data['watching'][watch_uuid].get('url'), + headers=datastore.data['watching'][watch_uuid].get('headers') ) # For test diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 2d29cfea..d6dbbf0b 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -66,8 +66,8 @@ class Fetcher(): def __init__(self): import importlib.resources - self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text() - self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text() + self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') + self.instock_data_js = 
importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') @abstractmethod def get_error(self): diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 94c6350d..df33fbe6 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -75,6 +75,7 @@ function isItemInStock() { 'vergriffen', 'vorbestellen', 'vorbestellung ist bald möglich', + 'we don\'t currently have any', 'we couldn\'t find any products that match', 'we do not currently have an estimate of when this product will be back in stock.', 'we don\'t know when or if this item will be back in stock.', @@ -173,7 +174,8 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { + // Note: there's also an automated test that places the 'out of stock' text fairly low down + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { continue } elementText = ""; @@ -187,7 +189,7 @@ function isItemInStock() { // and these mean its out of stock for (const outOfStockText of outOfStockTexts) { if (elementText.includes(outOfStockText)) { - console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) + console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) return outOfStockText; // item is out of stock } } diff --git 
a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 87c0df70..ccd89436 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -164,6 +164,15 @@ visibleElementsArray.forEach(function (element) { } } + let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + + let text = element.textContent.trim().slice(0, 30).trim(); + while (/\n{2,}|\t{2,}/.test(text)) { + text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') + } + + // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. + const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; size_pos.push({ xpath: xpath_result, @@ -171,9 +180,16 @@ visibleElementsArray.forEach(function (element) { height: Math.round(bbox['height']), left: Math.floor(bbox['left']), top: Math.floor(bbox['top']) + scroll_y, + // tagName used by Browser Steps tagName: (element.tagName) ? element.tagName.toLowerCase() : '', + // tagtype used by Browser Steps tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? 
element.type.toLowerCase() : '', - isClickable: window.getComputedStyle(element).cursor == "pointer" + isClickable: window.getComputedStyle(element).cursor === "pointer", + // Used by the keras trainer + fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), + fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), + hasDigitCurrency: hasDigitCurrency, + label: label, }); }); diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 0ecfb75b..fd12393a 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1377,17 +1377,19 @@ def changedetection_app(config=None, datastore_o=None): import brotli watch = datastore.data['watching'].get(uuid) - if watch and os.path.isdir(watch.watch_data_dir): - latest_filename = list(watch.history.keys())[0] + if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir): + latest_filename = list(watch.history.keys())[-1] html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br") - if html_fname.endswith('.br'): - # Read and decompress the Brotli file - with open(html_fname, 'rb') as f: + with open(html_fname, 'rb') as f: + if html_fname.endswith('.br'): + # Read and decompress the Brotli file decompressed_data = brotli.decompress(f.read()) + else: + decompressed_data = f.read() - buffer = BytesIO(decompressed_data) + buffer = BytesIO(decompressed_data) - return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html') + return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html') # Return a 500 error diff --git a/changedetectionio/processors/restock_diff/__init__.py b/changedetectionio/processors/restock_diff/__init__.py index 1aeca8af..3d472bee 100644 --- a/changedetectionio/processors/restock_diff/__init__.py +++ b/changedetectionio/processors/restock_diff/__init__.py @@ -1,11 +1,12 @@ +from babel.numbers import 
parse_decimal from changedetectionio.model.Watch import model as BaseWatch +from typing import Union import re -from babel.numbers import parse_decimal class Restock(dict): - def parse_currency(self, raw_value: str) -> float: + def parse_currency(self, raw_value: str) -> Union[float, None]: # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer. standardized_value = raw_value @@ -21,8 +22,11 @@ class Restock(dict): # Remove any non-numeric characters except for the decimal point standardized_value = re.sub(r'[^\d.-]', '', standardized_value) - # Convert to float - return float(parse_decimal(standardized_value, locale='en')) + if standardized_value: + # Convert to float + return float(parse_decimal(standardized_value, locale='en')) + + return None def __init__(self, *args, **kwargs): # Define default values diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index b2184e35..1a3a96ca 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock: import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") - value = {} now = time.time() - # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. 
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + try: + data = extruct.extract(html_content, syntaxes=syntaxes) + except Exception as e: + logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}") + return Restock() - data = extruct.extract(html_content, syntaxes=syntaxes) logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py index f7c259eb..4a1d66fb 100644 --- a/changedetectionio/tests/test_backend.py +++ b/changedetectionio/tests/test_backend.py @@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure wait_for_all_checks(client) + uuid = extract_UUID_from_client(client) + + # Check the 'get latest snapshot works' + res = client.get(url_for("watch_get_latest_html", uuid=uuid)) + assert b'which has this one new line' in res.data + # Now something should be ready, indicated by having a 'unviewed' class res = client.get(url_for("index")) assert b'unviewed' in res.data @@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert expected_url.encode('utf-8') in res.data # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times - res = client.get(url_for("diff_history_page", uuid="first")) + res = client.get(url_for("diff_history_page", uuid=uuid)) assert b'selected=""' in res.data, "Confirm diff history page loaded" # Check the [preview] pulls the right one @@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert b'unviewed' not in res.data # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again - uuid = extract_UUID_from_client(client) 
client.get(url_for("clear_watch_history", uuid=uuid)) client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) res = client.get(url_for("index")) assert b'preview/' in res.data - - # Check the 'get latest snapshot works' - res = client.get(url_for("watch_get_latest_html", uuid=uuid)) - assert b'head title' in res.data - # # Cleanup everything res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) diff --git a/docker-compose.yml b/docker-compose.yml index 2480a339..108ea093 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: # # Log levels are in descending order. (TRACE is the most detailed one) # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL - # - LOGGER_LEVEL=DEBUG + # - LOGGER_LEVEL=TRACE # # Alternative WebDriver/selenium URL, do not use "'s or 's! # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub @@ -29,8 +29,9 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # - # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 + # Alternative target "Chrome" Playwright URL, do not use "'s or 's! + # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser. + # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # @@ -73,10 +74,10 @@ services: # condition: service_started - # Used for fetching pages via Playwright+Chrome where you need Javascript support. + # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages. 
# RECOMMENDED FOR FETCHING PAGES WITH CHROME -# playwright-chrome: -# hostname: playwright-chrome +# sockpuppetbrowser: +# hostname: sockpuppetbrowser # image: dgtlmoon/sockpuppetbrowser:latest # cap_add: # - SYS_ADMIN diff --git a/requirements.txt b/requirements.txt index 2e085cf6..537c3f80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,8 +79,9 @@ pyppeteerstealth>=0.0.4 pytest ~=7.2 pytest-flask ~=1.2 -# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708) -jsonschema==4.17.3 +# Anything 4.0 and up but not 5.0 +jsonschema ~= 4.0 + loguru