From 730f37c7ba2c8d2ec5dbfa56315a37b106f73981 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 19 Aug 2024 09:17:18 +0200 Subject: [PATCH 01/12] Set encoding type for scraper script reader (#2574 #2568) --- changedetectionio/content_fetchers/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 66dd7403..c764f77e 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -65,8 +65,8 @@ class Fetcher(): def __init__(self): import importlib.resources - self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text() - self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text() + self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') + self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') @abstractmethod def get_error(self): From bf0d410d32ad78fcb7399e4bc5d6b4250dcfd133 Mon Sep 17 00:00:00 2001 From: Mike Splain Date: Mon, 19 Aug 2024 04:21:05 -0400 Subject: [PATCH 02/12] Browser Steps UI - Interactive UI wasn't sending headers but was when the check ran (#2551) --- changedetectionio/blueprint/browser_steps/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index f92bf9f8..a472ba4b 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore): 
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui( playwright_browser=browsersteps_start_session['browser'], proxy=proxy, - start_url=datastore.data['watching'][watch_uuid].get('url') + start_url=datastore.data['watching'][watch_uuid].get('url'), + headers=datastore.data['watching'][watch_uuid].get('headers') ) # For test From 932cf15e1ea2df3f0b4cdae137a5455019fea718 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 19 Aug 2024 15:47:19 +0200 Subject: [PATCH 03/12] Price and restock scraping - small price fix scraper (#2575) --- .../processors/restock_diff/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/changedetectionio/processors/restock_diff/__init__.py b/changedetectionio/processors/restock_diff/__init__.py index 1aeca8af..3d472bee 100644 --- a/changedetectionio/processors/restock_diff/__init__.py +++ b/changedetectionio/processors/restock_diff/__init__.py @@ -1,11 +1,12 @@ +from babel.numbers import parse_decimal from changedetectionio.model.Watch import model as BaseWatch +from typing import Union import re -from babel.numbers import parse_decimal class Restock(dict): - def parse_currency(self, raw_value: str) -> float: + def parse_currency(self, raw_value: str) -> Union[float, None]: # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer. 
standardized_value = raw_value @@ -21,8 +22,11 @@ class Restock(dict): # Remove any non-numeric characters except for the decimal point standardized_value = re.sub(r'[^\d.-]', '', standardized_value) - # Convert to float - return float(parse_decimal(standardized_value, locale='en')) + if standardized_value: + # Convert to float + return float(parse_decimal(standardized_value, locale='en')) + + return None def __init__(self, *args, **kwargs): # Define default values From 371f85d5441a8b4d2cece75476542184355d2231 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 19 Aug 2024 17:20:30 +0200 Subject: [PATCH 04/12] Watch 'Download last snapshot' link/button should give last, not first snapshot (#2576) --- changedetectionio/flask_app.py | 16 +++++++++------- changedetectionio/tests/test_backend.py | 14 +++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 0ecfb75b..fd12393a 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1377,17 +1377,19 @@ def changedetection_app(config=None, datastore_o=None): import brotli watch = datastore.data['watching'].get(uuid) - if watch and os.path.isdir(watch.watch_data_dir): - latest_filename = list(watch.history.keys())[0] + if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir): + latest_filename = list(watch.history.keys())[-1] html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br") - if html_fname.endswith('.br'): - # Read and decompress the Brotli file - with open(html_fname, 'rb') as f: + with open(html_fname, 'rb') as f: + if html_fname.endswith('.br'): + # Read and decompress the Brotli file decompressed_data = brotli.decompress(f.read()) + else: + decompressed_data = f.read() - buffer = BytesIO(decompressed_data) + buffer = BytesIO(decompressed_data) - return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html') + 
return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html') # Return a 500 error diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py index f7c259eb..4a1d66fb 100644 --- a/changedetectionio/tests/test_backend.py +++ b/changedetectionio/tests/test_backend.py @@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure wait_for_all_checks(client) + uuid = extract_UUID_from_client(client) + + # Check the 'get latest snapshot works' + res = client.get(url_for("watch_get_latest_html", uuid=uuid)) + assert b'which has this one new line' in res.data + # Now something should be ready, indicated by having a 'unviewed' class res = client.get(url_for("index")) assert b'unviewed' in res.data @@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert expected_url.encode('utf-8') in res.data # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times - res = client.get(url_for("diff_history_page", uuid="first")) + res = client.get(url_for("diff_history_page", uuid=uuid)) assert b'selected=""' in res.data, "Confirm diff history page loaded" # Check the [preview] pulls the right one @@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert b'unviewed' not in res.data # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again - uuid = extract_UUID_from_client(client) client.get(url_for("clear_watch_history", uuid=uuid)) client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) res = client.get(url_for("index")) assert b'preview/' in res.data - - # Check the 'get latest snapshot works' - res = client.get(url_for("watch_get_latest_html", uuid=uuid)) - assert b'head title' in res.data - # # Cleanup everything res = 
client.get(url_for("form_delete", uuid="all"), follow_redirects=True) From 6dd1fa2b885b191776a4d139809e54fd4ba1f718 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 19 Aug 2024 17:22:13 +0200 Subject: [PATCH 05/12] 0.46.03 --- changedetectionio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 5ec6f891..1872a520 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -2,7 +2,7 @@ # Read more https://github.com/dgtlmoon/changedetection.io/wiki -__version__ = '0.46.02' +__version__ = '0.46.03' from changedetectionio.strtobool import strtobool from json.decoder import JSONDecodeError From 7071df061ad06048b4b0c5522e4d54171a7aa371 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 22 Aug 2024 15:01:36 +0200 Subject: [PATCH 06/12] Price detection/scraping - Adding extra element training data (#2582) --- .../res/xpath_element_scraper.js | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 87c0df70..ccd89436 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -164,6 +164,15 @@ visibleElementsArray.forEach(function (element) { } } + let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + + let text = element.textContent.trim().slice(0, 30).trim(); + while (/\n{2,}|\t{2,}/.test(text)) { + text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') + } + + // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. 
+ const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; size_pos.push({ xpath: xpath_result, @@ -171,9 +180,16 @@ visibleElementsArray.forEach(function (element) { height: Math.round(bbox['height']), left: Math.floor(bbox['left']), top: Math.floor(bbox['top']) + scroll_y, + // tagName used by Browser Steps tagName: (element.tagName) ? element.tagName.toLowerCase() : '', + // tagtype used by Browser Steps tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', - isClickable: window.getComputedStyle(element).cursor == "pointer" + isClickable: window.getComputedStyle(element).cursor === "pointer", + // Used by the keras trainer + fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), + fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), + hasDigitCurrency: hasDigitCurrency, + label: label, }); }); From 1fb4342488a4af9ef2f396e38d5e4fa48b842928 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 22 Aug 2024 15:02:00 +0200 Subject: [PATCH 07/12] Build - Unpin jsonschema for faster builds (#2583) --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2e085cf6..537c3f80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,8 +79,9 @@ pyppeteerstealth>=0.0.4 pytest ~=7.2 pytest-flask ~=1.2 -# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708) -jsonschema==4.17.3 +# Anything 4.0 and up but not 5.0 +jsonschema ~= 4.0 + loguru From 4225900ec3ae38651c7db24a8f18e297936a87f2 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 12:47:21 +0200 Subject: [PATCH 08/12] Restock - updating texts and text offsets --- changedetectionio/content_fetchers/res/stock-not-in-stock.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 94c6350d..6958a4d5 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -75,6 +75,7 @@ function isItemInStock() { 'vergriffen', 'vorbestellen', 'vorbestellung ist bald möglich', + 'we don\'t currently have any', 'we couldn\'t find any products that match', 'we do not currently have an estimate of when this product will be back in stock.', 'we don\'t know when or if this item will be back in stock.', @@ -173,7 +174,7 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 200) { continue } elementText = ""; @@ -187,7 +188,7 @@ function isItemInStock() { // and these mean its out of stock for (const outOfStockText of outOfStockTexts) { if (elementText.includes(outOfStockText)) { - console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) + console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) return outOfStockText; // item is out of stock } } From 55fe2abf42e819a6292d32021348830ba4b612a3 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 13:07:06 +0200 Subject: [PATCH 09/12] Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602) --- changedetectionio/processors/restock_diff/processor.py | 9 
++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index b2184e35..1a3a96ca 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock: import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") - value = {} now = time.time() - # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + try: + data = extruct.extract(html_content, syntaxes=syntaxes) + except Exception as e: + logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}") + return Restock() - data = extruct.extract(html_content, syntaxes=syntaxes) logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful From 1cb38347daac33d5a1e20d76b4b14ddf2876097a Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 2 Sep 2024 13:20:44 +0200 Subject: [PATCH 10/12] Container name should be 'sockpuppetbrowser' because its not just playwright that uses it --- docker-compose.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2480a339..108ea093 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: # # Log levels are in descending order. 
(TRACE is the most detailed one) # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL - # - LOGGER_LEVEL=DEBUG + # - LOGGER_LEVEL=TRACE # # Alternative WebDriver/selenium URL, do not use "'s or 's! # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub @@ -29,8 +29,9 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # - # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 + # Alternative target "Chrome" Playwright URL, do not use "'s or 's! + # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser. + # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # @@ -73,10 +74,10 @@ services: # condition: service_started - # Used for fetching pages via Playwright+Chrome where you need Javascript support. + # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages. 
# RECOMMENDED FOR FETCHING PAGES WITH CHROME -# playwright-chrome: -# hostname: playwright-chrome +# sockpuppetbrowser: +# hostname: sockpuppetbrowser # image: dgtlmoon/sockpuppetbrowser:latest # cap_add: # - SYS_ADMIN From 60d292107d9599104e79a890374affd8c2e3af3f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 2 Sep 2024 15:11:31 +0200 Subject: [PATCH 11/12] Fixing restock monitor tests and tweaking docker default config example, --- changedetectionio/content_fetchers/res/stock-not-in-stock.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 6958a4d5..df33fbe6 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -174,7 +174,8 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 200) { + // Note: there's also an automated test that places the 'out of stock' text fairly low down + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { continue } elementText = ""; From 5b70625eaabba8ed85bb271f32c6c955513e1ef2 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 4 Sep 2024 13:55:18 +0200 Subject: [PATCH 12/12] 0.46.04 --- changedetectionio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1872a520..ca262be7 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -2,7 +2,7 @@ # Read more 
https://github.com/dgtlmoon/changedetection.io/wiki -__version__ = '0.46.03' +__version__ = '0.46.04' from changedetectionio.strtobool import strtobool from json.decoder import JSONDecodeError