From cf3f3e44977f72c12d73b5807c4998993d236332 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 7 May 2023 13:06:11 +0200
Subject: [PATCH 1/4] BrowserSteps - BrowserSteps was not always following
 proxy information

---
 changedetectionio/blueprint/browser_steps/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py
index 381b2e59..37f18a81 100644
--- a/changedetectionio/blueprint/browser_steps/__init__.py
+++ b/changedetectionio/blueprint/browser_steps/__init__.py
@@ -169,7 +169,18 @@ def construct_blueprint(datastore: ChangeDetectionStore):
         if proxy_id:
             proxy_url = datastore.proxy_list.get(proxy_id).get('url')
             if proxy_url:
+
+                # Playwright needs separate username and password values
+                from urllib.parse import urlparse
+                parsed = urlparse(proxy_url)
                 proxy = {'server': proxy_url}
+
+                if parsed.username:
+                    proxy['username'] = parsed.username
+
+                if parsed.password:
+                    proxy['password'] = parsed.password
+
                 print("Browser Steps: UUID {} Using proxy {}".format(uuid, proxy_url))
 
         # Begin the new "Playwright Context" that re-uses the playwright interface

From d814535dc6f13d22d8f33ce443ed0c9c02374238 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 7 May 2023 13:10:56 +0200
Subject: [PATCH 2/4] Element scraper - wrap offset detection in try/catch

---
 changedetectionio/res/xpath_element_scraper.js | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/changedetectionio/res/xpath_element_scraper.js b/changedetectionio/res/xpath_element_scraper.js
index e363d2a1..27a54c2c 100644
--- a/changedetectionio/res/xpath_element_scraper.js
+++ b/changedetectionio/res/xpath_element_scraper.js
@@ -8,8 +8,15 @@
 // Some pages like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis
 // will automatically force a scroll somewhere, so include the position offset
 // Lets hope the position doesnt change while we iterate the bbox's, but this is better than nothing
+var scroll_y = 0;
+try {
+    scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
+} catch (e) {
+    console.log(e);
+}
+
+
 
-var scroll_y=+document.documentElement.scrollTop || document.body.scrollTop
 
 // Include the getXpath script directly, easier than fetching
 function getxpath(e) {

From 3801d339f5f803de84ff18c36d8436e09c6011b8 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 7 May 2023 13:47:17 +0200
Subject: [PATCH 3/4] UI - Adding shortcut list select button for "clear/reset
 history"

---
 changedetectionio/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index aeb6f555..1afddfd1 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -1313,6 +1313,13 @@ def changedetection_app(config=None, datastore_o=None):
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
            flash("{} watches queued for rechecking".format(len(uuids)))
 
+        elif (op == 'clear-history'):
+            for uuid in uuids:
+                uuid = uuid.strip()
+                if datastore.data['watching'].get(uuid):
+                    datastore.clear_watch_history(uuid)
+            flash("{} watches cleared/reset.".format(len(uuids)))
+
        elif (op == 'notification-default'):
            from changedetectionio.notification import (
                default_notification_format_for_watch
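
A quick aside before the final patch in the series: PATCH 1/4 leans on urllib.parse.urlparse() to split the credentials out of a single proxy URL, because Playwright wants username and password as separate fields alongside the server entry. The sketch below is only a minimal, standalone illustration of that split; the helper name build_playwright_proxy and the example proxy URL are made up for this note and are not part of the patch.

from urllib.parse import urlparse

def build_playwright_proxy(proxy_url):
    # Mirror of the logic added in PATCH 1/4: keep the full URL as 'server',
    # and only add 'username'/'password' when they are present in the URL.
    proxy = {'server': proxy_url}
    parsed = urlparse(proxy_url)
    if parsed.username:
        proxy['username'] = parsed.username
    if parsed.password:
        proxy['password'] = parsed.password
    return proxy

if __name__ == '__main__':
    # Prints: {'server': 'http://user:pass@squid.example:3128', 'username': 'user', 'password': 'pass'}
    print(build_playwright_proxy('http://user:pass@squid.example:3128'))
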
From 316f28a0f2d4081455b2740d050155bca30cd44e Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 7 May 2023 13:49:53 +0200
Subject: [PATCH 4/4] Fetcher - Experimental fetcher fixes, now only enabled
 with 'USE_EXPERIMENTAL_PUPPETEER_FETCH' env var (default off) (#1561)

---
 changedetectionio/content_fetcher.py | 115 +++++++++++++++------------
 changedetectionio/update_worker.py   |   3 +-
 2 files changed, 66 insertions(+), 52 deletions(-)

diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 2d8a66ab..262ac2b4 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -277,7 +277,7 @@ class base_html_playwright(Fetcher):
         with open(destination, 'w') as f:
             f.write(content)
 
-    def run(self,
+    def run_fetch_browserless_puppeteer(self,
             url,
             timeout,
             request_headers,
@@ -287,63 +287,63 @@ class base_html_playwright(Fetcher):
             current_include_filters=None,
             is_binary=False):
 
-        # Fallback for now to the old way if browsersteps
-        # @todo - need to figure out how to get browsersteps with images on each step working
-        if self.browser_steps:
-            for step in self.browser_steps:
-                if step.get('operation'):
-                    return self.run_playwright(
-                        url,
-                        timeout,
-                        request_headers,
-                        request_body,
-                        request_method,
-                        ignore_status_codes,
-                        current_include_filters,
-                        is_binary)
-        elif os.getenv('FORCE_PLAYWRIGHT_FETCH'):
-            # Temporary backup solution until we rewrite the playwright code
-            return self.run_playwright(
-                url,
-                timeout,
-                request_headers,
-                request_body,
-                request_method,
-                ignore_status_codes,
-                current_include_filters,
-                is_binary)
-
         extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
 
         xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
         code = f"""module.exports = async ({{ page, context }}) => {{
 
-          var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy}} = context;
+          var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password}} = context;
 
           await page.setBypassCSP(true)
           await page.setExtraHTTPHeaders(req_headers);
           await page.setUserAgent(user_agent);
+
           // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
-          if(proxy) {{
+          await page.setDefaultNavigationTimeout(0);
+
+          if(proxy_username) {{
             await page.authenticate({{
-              username: proxy['username'],
-              password: proxy['password'],
+              username: proxy_username,
+              password: proxy_password
             }});
           }}
-
-          const r = await page.goto(url, wait_until='commit');
-          await page.waitForTimeout(extra_wait_ms)
+
+          await page.setViewport({{
+            width: 1024,
+            height: 768,
+            deviceScaleFactor: 1,
+          }});
+
+          const r = await page.goto(url, {{
+            waitUntil: 'load'
+          }});
+
+          await page.waitForTimeout(1000);
+          await page.waitForTimeout(extra_wait_ms);
 
           if(execute_js) {{
             await page.evaluate(execute_js);
             await page.waitForTimeout(200);
           }}
 
-          const xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
-          const instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
-
-          const html = await page.content();
-          const b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
+          var html = await page.content();
+          var xpath_data;
+          var instock_data;
+          try {{
+            xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
+            instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
+          }} catch (e) {{
+            console.log(e);
+          }}
+
+          // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
+          // Wrap it here (for now)
+          var b64s;
+          try {{
+            b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
+          }} catch (e) {{
+            console.log(e);
+          }}
+
           return {{
             data: {{
               'content': html,
@@ -387,15 +387,16 @@ class base_html_playwright(Fetcher):
                         'execute_js': self.webdriver_js_execute_code,
                         'extra_wait_ms': extra_wait_ms,
                         'include_filters': current_include_filters,
-                        'proxy': self.proxy,
                         'req_headers': request_headers,
                         'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
                         'url': url,
                         'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
+                        'proxy_username': self.proxy.get('username','') if self.proxy else False,
+                        'proxy_password': self.proxy.get('password','') if self.proxy else False,
                     }
                 },
                 # @todo /function needs adding ws:// to http:// rebuild this
-                url=browserless_function_url,
+                url=browserless_function_url+"&--disable-features=AudioServiceOutOfProcess&dumpio=true",
                 timeout=wait_browserless_seconds)
 
         except ReadTimeout:
@@ -427,15 +428,27 @@ class base_html_playwright(Fetcher):
             # Some other error from browserless
             raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
 
-    def run_playwright(self,
-            url,
-            timeout,
-            request_headers,
-            request_body,
-            request_method,
-            ignore_status_codes=False,
-            current_include_filters=None,
-            is_binary=False):
+    def run(self,
+            url,
+            timeout,
+            request_headers,
+            request_body,
+            request_method,
+            ignore_status_codes=False,
+            current_include_filters=None,
+            is_binary=False):
+
+        if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+            # Temporary backup solution until we rewrite the playwright code
+            return self.run_fetch_browserless_puppeteer(
+                url,
+                timeout,
+                request_headers,
+                request_body,
+                request_method,
+                ignore_status_codes,
+                current_include_filters,
+                is_binary)
 
         from playwright.sync_api import sync_playwright
         import playwright._impl._api_types

diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index 9fb49c45..96fd5b1e 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -315,7 +315,8 @@ class update_worker(threading.Thread):
                     self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
                 self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
-                                                                   'last_check_status': e.status_code})
+                                                                   'last_check_status': e.status_code,
+                                                                   'has_ldjson_price_data': None})
                 process_changedetection_results = False
 
             except Exception as e:
                 self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
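
A closing note on PATCH 4/4: the new gate in run() is plain os.getenv() truthiness, so any non-empty value of USE_EXPERIMENTAL_PUPPETEER_FETCH switches fetching to the browserless/puppeteer path, while leaving it unset (the default) keeps the existing Playwright path. The following is only a rough standalone sketch of that dispatch; fetch_with_puppeteer() and fetch_with_playwright() are hypothetical stand-ins for the real methods.

import os

def fetch_with_puppeteer(url):
    # Hypothetical stand-in for run_fetch_browserless_puppeteer()
    return 'puppeteer fetch: ' + url

def fetch_with_playwright(url):
    # Hypothetical stand-in for the pre-existing Playwright body of run()
    return 'playwright fetch: ' + url

def run(url):
    # Same shape as the dispatch added in PATCH 4/4: the experimental path is
    # only taken when the env var is set to a non-empty value (default: off).
    if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
        return fetch_with_puppeteer(url)
    return fetch_with_playwright(url)

if __name__ == '__main__':
    # e.g. USE_EXPERIMENTAL_PUPPETEER_FETCH=1 python sketch.py
    print(run('https://example.com'))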