Merge branch 'master' into total-bytes-counter

total-bytes-counter
dgtlmoon 4 months ago
commit 92d715272a

@@ -2,7 +2,7 @@
 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
-__version__ = '0.46.02'
+__version__ = '0.46.04'
 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError

@@ -85,7 +85,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
         browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
             playwright_browser=browsersteps_start_session['browser'],
             proxy=proxy,
-            start_url=datastore.data['watching'][watch_uuid].get('url')
+            start_url=datastore.data['watching'][watch_uuid].get('url'),
+            headers=datastore.data['watching'][watch_uuid].get('headers')
         )
         # For test
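Editor's note: the trailing comma on the `start_url` line is what lets the new `headers` keyword be forwarded. A minimal sketch of the resulting call shape (the wrapper function is hypothetical; the names inside come from the diff):

```python
# Hypothetical wrapper showing how the watch's stored request headers are
# now forwarded into the live browser-steps session.
def start_browsersteps_session(datastore, browser_steps, browsersteps_start_session,
                               watch_uuid, proxy=None):
    watch = datastore.data['watching'][watch_uuid]
    return browser_steps.browsersteps_live_ui(
        playwright_browser=browsersteps_start_session['browser'],
        proxy=proxy,
        start_url=watch.get('url'),
        headers=watch.get('headers'),  # e.g. {'Authorization': 'Basic ...'}
    )
```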

@@ -66,8 +66,8 @@ class Fetcher():
     def __init__(self):
         import importlib.resources
-        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
-        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()
+        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
+        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')

     @abstractmethod
     def get_error(self):
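Editor's note: `Traversable.read_text()` defaults to `encoding=None`, which falls back to the platform's locale encoding (e.g. cp1252 on Windows), and `stock-not-in-stock.js` contains non-ASCII phrases such as 'vorbestellung ist bald möglich' visible in this very diff. A minimal sketch of the failure mode being fixed:

```python
import importlib.resources

# Without an explicit encoding, read_text() uses the locale default and can
# raise UnicodeDecodeError on the non-ASCII strings in the bundled JS.
res = importlib.resources.files("changedetectionio.content_fetchers.res")
instock_js = res.joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
```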

@@ -75,6 +75,7 @@ function isItemInStock() {
         'vergriffen',
         'vorbestellen',
         'vorbestellung ist bald möglich',
+        'we don\'t currently have any',
         'we couldn\'t find any products that match',
         'we do not currently have an estimate of when this product will be back in stock.',
         'we don\'t know when or if this item will be back in stock.',
@@ -173,7 +174,8 @@ function isItemInStock() {
         const element = elementsToScan[i];
         // outside the 'fold' or some weird text in the heading area
         // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
-        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+        // Note: there's also an automated test that places the 'out of stock' text fairly low down
+        if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
             continue
         }
         elementText = "";
@@ -187,7 +189,7 @@ function isItemInStock() {
         // and these mean its out of stock
         for (const outOfStockText of outOfStockTexts) {
             if (elementText.includes(outOfStockText)) {
-                console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
+                console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
                 return outOfStockText; // item is out of stock
             }
         }
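Editor's note: the widened fold filter accepts an element only when its absolute top sits between 100px and one viewport-height plus 250px (raised from 150 so the test's low-placed text still qualifies). A hedged Python restatement of the scan loop, with elements modelled as plain dicts:

```python
# Python sketch of the JS scan: skip elements above the header area or well
# below the first screenful, then return the first out-of-stock phrase found.
def first_out_of_stock_match(elements, out_of_stock_texts, vh):
    for el in elements:
        top = el['top']  # stands in for getBoundingClientRect().top + window.scrollY
        if top >= vh + 250 or top <= 100:
            continue
        text = el['text'].lower()
        for phrase in out_of_stock_texts:
            if phrase in text:
                return phrase  # item is out of stock
    return None
```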

@@ -164,6 +164,15 @@ visibleElementsArray.forEach(function (element) {
         }
     }

+    let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
+    let text = element.textContent.trim().slice(0, 30).trim();
+    while (/\n{2,}|\t{2,}/.test(text)) {
+        text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
+    }
+
+    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
+    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,)/.test(text) ;
+
     size_pos.push({
         xpath: xpath_result,
@@ -171,9 +180,16 @@ visibleElementsArray.forEach(function (element) {
         height: Math.round(bbox['height']),
         left: Math.floor(bbox['left']),
         top: Math.floor(bbox['top']) + scroll_y,
+        // tagName used by Browser Steps
         tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
+        // tagtype used by Browser Steps
         tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
-        isClickable: window.getComputedStyle(element).cursor == "pointer"
+        isClickable: window.getComputedStyle(element).cursor === "pointer",
+        // Used by the keras trainer
+        fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
+        fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
+        hasDigitCurrency: hasDigitCurrency,
+        label: label,
     });
 });
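Editor's note: the new fields (`fontSize`, `fontWeight`, `hasDigitCurrency`, `label`) feed the hand-labelled keras training data mentioned in the diff comments. For clarity, the currency heuristic in Python form (function name hypothetical):

```python
import re

# Same heuristic as the JS hasDigitCurrency: a digit near either end of the
# truncated element text, plus a currency marker anywhere in it.
CURRENCY = re.compile(r'([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,)')

def has_digit_currency(text: str) -> bool:
    text = text.strip()[:30].strip()
    digit_near_edge = bool(re.search(r'\d', text[:6]) or re.search(r'\d', text[-6:]))
    return digit_near_edge and bool(CURRENCY.search(text))
```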

@@ -1377,13 +1377,15 @@ def changedetection_app(config=None, datastore_o=None):
         import brotli

         watch = datastore.data['watching'].get(uuid)
-        if watch and os.path.isdir(watch.watch_data_dir):
-            latest_filename = list(watch.history.keys())[0]
+        if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
+            latest_filename = list(watch.history.keys())[-1]
             html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
-            if html_fname.endswith('.br'):
-                # Read and decompress the Brotli file
-                with open(html_fname, 'rb') as f:
+            with open(html_fname, 'rb') as f:
+                if html_fname.endswith('.br'):
+                    # Read and decompress the Brotli file
                     decompressed_data = brotli.decompress(f.read())
+                else:
+                    decompressed_data = f.read()

             buffer = BytesIO(decompressed_data)
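Editor's note: `watch.history` keys are snapshot timestamps in insertion order, so the old `[0]` returned the *oldest* snapshot; the new guard also stops the endpoint firing on a watch with no history at all, and the restructured `with` block serves plain (non-Brotli) files too. A self-contained sketch under those assumptions:

```python
import os
from io import BytesIO
import brotli

def latest_snapshot_buffer(watch) -> BytesIO:
    # [-1] is the newest timestamp key; only brotli-decompress .br files.
    latest_filename = list(watch.history.keys())[-1]
    html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
    with open(html_fname, 'rb') as f:
        raw = f.read()
    return BytesIO(brotli.decompress(raw) if html_fname.endswith('.br') else raw)
```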

@ -1,11 +1,12 @@
from babel.numbers import parse_decimal
from changedetectionio.model.Watch import model as BaseWatch from changedetectionio.model.Watch import model as BaseWatch
from typing import Union
import re import re
from babel.numbers import parse_decimal
class Restock(dict): class Restock(dict):
def parse_currency(self, raw_value: str) -> float: def parse_currency(self, raw_value: str) -> Union[float, None]:
# Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer. # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
standardized_value = raw_value standardized_value = raw_value
@@ -21,9 +22,12 @@ class Restock(dict):
         # Remove any non-numeric characters except for the decimal point
         standardized_value = re.sub(r'[^\d.-]', '', standardized_value)

-        # Convert to float
-        return float(parse_decimal(standardized_value, locale='en'))
+        if standardized_value:
+            # Convert to float
+            return float(parse_decimal(standardized_value, locale='en'))
+
+        return None

     def __init__(self, *args, **kwargs):
         # Define default values
         default_values = {
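Editor's note: the guard matters because when sanitising strips every character (e.g. a price field that contains no digits at all), `parse_decimal('')` raises rather than returning a value; `parse_currency` now returns `None` instead, matching the widened `Union[float, None]` annotation. A quick check of the babel call it relies on:

```python
from babel.numbers import parse_decimal

# parse_decimal returns a Decimal for a well-formed 'en'-locale number...
print(float(parse_decimal('1400.00', locale='en')))  # 1400.0

# ...but raises on an empty string, which is why the patch guards it.
try:
    parse_decimal('', locale='en')
except Exception as e:
    print(type(e).__name__)
```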

@@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock:
     import extruct
     logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

-    value = {}
     now = time.time()

     # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
     syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
-    data = extruct.extract(html_content, syntaxes=syntaxes)
+    try:
+        data = extruct.extract(html_content, syntaxes=syntaxes)
+    except Exception as e:
+        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
+        return Restock()

     logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

     # First phase, dead simple scanning of anything that looks useful
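Editor's note: the try/except means one unparseable document degrades to an empty `Restock()` instead of crashing the whole restock check. A self-contained sketch of the same guard (function name hypothetical):

```python
import extruct
from loguru import logger

def safe_availability_metadata(html_content: str) -> dict:
    """Guarded extruct extraction, per the patch: parse failures return {}."""
    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
    try:
        return extruct.extract(html_content, syntaxes=syntaxes)
    except Exception as e:
        logger.warning(f"extruct parse failed: {type(e).__name__} - {e}")
        return {}
```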

@@ -69,6 +69,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     wait_for_all_checks(client)

+    uuid = extract_UUID_from_client(client)
+
+    # Check the 'get latest snapshot works'
+    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
+    assert b'which has this one new line' in res.data
+
     # Now something should be ready, indicated by having a 'unviewed' class
     res = client.get(url_for("index"))
     assert b'unviewed' in res.data
@@ -86,7 +92,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     assert expected_url.encode('utf-8') in res.data

     # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
-    res = client.get(url_for("diff_history_page", uuid="first"))
+    res = client.get(url_for("diff_history_page", uuid=uuid))
     assert b'selected=""' in res.data, "Confirm diff history page loaded"

     # Check the [preview] pulls the right one
@@ -143,18 +149,12 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     assert b'unviewed' not in res.data

     # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
-    uuid = extract_UUID_from_client(client)
     client.get(url_for("clear_watch_history", uuid=uuid))
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
     wait_for_all_checks(client)
     res = client.get(url_for("index"))
     assert b'preview/' in res.data

-    # Check the 'get latest snapshot works'
-    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
-    assert b'<head><title>head title</title></head>' in res.data
-
     #
     # Cleanup everything
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)

@@ -18,7 +18,7 @@ services:
       #
       # Log levels are in descending order. (TRACE is the most detailed one)
       # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
-      # - LOGGER_LEVEL=DEBUG
+      # - LOGGER_LEVEL=TRACE
       #
       # Alternative WebDriver/selenium URL, do not use "'s or 's!
       # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,8 +29,9 @@ services:
       #
       # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
       #
-      # Alternative Playwright URL, do not use "'s or 's!
-      # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
+      # Alternative target "Chrome" Playwright URL, do not use "'s or 's!
+      # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
+      # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
       #
       # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
       #
@@ -73,10 +74,10 @@ services:
    #         condition: service_started

-   # Used for fetching pages via Playwright+Chrome where you need Javascript support.
+   # Sockpuppetbrowser is basically Chrome wrapped in an API for allowing fast fetching of web-pages.
    # RECOMMENDED FOR FETCHING PAGES WITH CHROME
-   # playwright-chrome:
-   #     hostname: playwright-chrome
+   # sockpuppetbrowser:
+   #     hostname: sockpuppetbrowser
    #     image: dgtlmoon/sockpuppetbrowser:latest
    #     cap_add:
    #         - SYS_ADMIN
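Editor's note: for reference, a minimal sketch of reaching the renamed sockpuppetbrowser service from Python, assuming (as the PLAYWRIGHT_DRIVER_URL above suggests) it exposes a Chrome DevTools Protocol websocket on port 3000:

```python
from playwright.sync_api import sync_playwright

# Connect to the browser container named in docker-compose over CDP.
with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp("ws://sockpuppetbrowser:3000")
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())
    browser.close()
```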

@@ -79,8 +79,9 @@ pyppeteerstealth>=0.0.4
 pytest ~=7.2
 pytest-flask ~=1.2

-# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
-jsonschema==4.17.3
+# Anything 4.0 and up but not 5.0
+jsonschema ~= 4.0
+
 loguru
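Editor's note: `~=` is PEP 440's compatible-release clause, so `jsonschema ~= 4.0` accepts any 4.x release and excludes 5.0, exactly as the new comment says. A quick check with the packaging library:

```python
from packaging.specifiers import SpecifierSet

# "~=4.0" is equivalent to ">= 4.0, == 4.*"
spec = SpecifierSet("~=4.0")
print(spec.contains("4.17.3"))  # True  (the previously pinned version)
print(spec.contains("5.0"))     # False
```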
