diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index b9765bac..75336251 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -173,6 +173,15 @@ class steppable_browser_interface():
         v = json.dumps(value)
         self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)
 
+    def action_wait_for_load_state(self, selector, value='networkidle'):
+        """Wait until the page reaches the Playwright load state named by `value`.
+
+        `selector` is not used; the (selector, value) signature matches the
+        other action_* methods. An empty `value` falls back to 'networkidle'.
+        """
+        # Playwright rejects an empty state name, so fall back to the default
+        self.page.wait_for_load_state(value or 'networkidle')
+
     # @todo - in the future make some popout interface to capture what needs to be set
     # https://playwright.dev/python/docs/api/class-keyboard
     def action_press_enter(self, selector, value):
@@ -190,6 +199,10 @@ class steppable_browser_interface():
     def action_uncheck_checkbox(self, selector, value):
         self.page.locator(selector, timeout=1000).uncheck(timeout=1000)
 
+    def get_locator(self, selector):
+        """Return a Playwright locator for `selector` on the current page."""
+        return self.page.locator(selector)
+
 
 # Responsible for maintaining a live 'context' with the chrome CDP
 # @todo - how long do contexts live for anyway?
diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index c764f77e..07a46207 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -52,6 +52,9 @@ class Fetcher():
     instock_data_js = ""
     status_code = None
     webdriver_js_execute_code = None
+    webdriver_enable_pagination = False
+    webdriver_paginated_js_execute_each_page = None
+    webdriver_paginated_next_selector = None
     xpath_data = None
     xpath_element_js = ""
 
diff --git a/changedetectionio/content_fetchers/exceptions/__init__.py b/changedetectionio/content_fetchers/exceptions/__init__.py
index 80ebae69..00df385b 100644
--- a/changedetectionio/content_fetchers/exceptions/__init__.py
+++ b/changedetectionio/content_fetchers/exceptions/__init__.py
@@ -95,3 +95,15 @@ class ReplyWithContentButNoText(Exception):
         self.html_content = html_content
         self.xpath_data = xpath_data
         return
+
+
+class PaginatedContentMisconfigured(Exception):
+    """Paginated mode is enabled but its per-page JS or next-page selector is missing."""
+
+    def __init__(self, status_code=None, screenshot=None, message=None):
+        # The update_worker handler for this exception reads e.screenshot (and, like its
+        # sibling handlers, likely e.status_code), so these must exist on every instance.
+        self.status_code = status_code
+        self.screenshot = screenshot
+        self.message = message
+        return
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index 53be33f1..8e85deb9 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
 from loguru import logger
 
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, PaginatedContentMisconfigured, ScreenshotUnavailable
 
 class fetcher(Fetcher):
     fetcher_description = "Playwright {}/Javascript".format(
@@ -133,10 +133,20 @@ class fetcher(Fetcher):
                 browser.close()
                 logger.debug("Content Fetcher > Response object from the browser communication was none")
                 raise EmptyReply(url=url, status_code=None)
-            
+
+            # Validate the pagination settings before the try block below: its handlers
+            # target JS/browser failures (a later one raises PageUnloadable) and appear
+            # to hide this configuration error from update_worker otherwise.
+            if self.webdriver_enable_pagination and not self._pagination_is_configured():
+                context.close()
+                browser.close()
+                raise PaginatedContentMisconfigured()
+
             try:
-                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
-                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
+                if self.webdriver_enable_pagination:
+                    self.run_paginated(url=url)
+                elif self.webdriver_js_execute_code:
+                    self.run_normal(browsersteps_interface=browsersteps_interface)
             except playwright._impl._errors.TimeoutError as e:
                 context.close()
                 browser.close()
@@ -147,7 +157,7 @@ class fetcher(Fetcher):
                 context.close()
                 browser.close()
                 raise PageUnloadable(url=url, status_code=None, message=str(e))
-            
+
             extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
             self.page.wait_for_timeout(extra_wait * 1000)
 
@@ -209,3 +219,79 @@ class fetcher(Fetcher):
             finally:
                 context.close()
                 browser.close()
+
+    def run_normal(self, browsersteps_interface):
+        """
+        Run normal content extraction: execute the "before change detection" JS once.
+        """
+        browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
+
+    def _pagination_is_configured(self):
+        """Return True when both the per-page JS and the next-page selector are set."""
+        return bool(self.webdriver_paginated_js_execute_each_page and self.webdriver_paginated_next_selector)
+
+    def run_paginated(self, url, max_pages=None):
+        """
+        Run paginated content extraction in the following order:
+        1. Execute the initial JS code (when set) after the page is loaded
+        2.
+            a. Execute JS code to extract content from the page
+            b. Look for a "next page" button and click it if it exists
+            c. Repeat step 2 until the "next page" button is not found or max_pages is reached
+        3. Write the extracted content to a hidden input element with ID "cd_data"
+
+        :param url: start URL passed to steppable_browser_interface
+        :param max_pages: upper bound on visited pages; defaults to the
+            WEBDRIVER_PAGINATION_MAX_PAGES environment variable, or 100
+        :raises PaginatedContentMisconfigured: per-page JS or next-page selector is not set
+        """
+        if not self._pagination_is_configured():
+            raise PaginatedContentMisconfigured()
+
+        import json
+        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
+        # Public import path; aliased so the builtin TimeoutError is not shadowed
+        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
+
+        if max_pages is None:
+            max_pages = int(os.getenv("WEBDRIVER_PAGINATION_MAX_PAGES", 100))
+
+        browsersteps_interface = steppable_browser_interface(start_url=url)
+        browsersteps_interface.page = self.page
+
+        # The initial JS is optional in paginated mode
+        if self.webdriver_js_execute_code:
+            browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
+        browsersteps_interface.action_wait_for_load_state(selector=None)
+
+        collected = []
+        step_n = 1
+        while True:
+            logger.debug(f"Paginated content > Page {step_n}")
+            page_data = browsersteps_interface.action_execute_js(value=self.webdriver_paginated_js_execute_each_page, selector=None)
+            if page_data is not None and page_data != "":
+                # Guard against the per-page JS returning a non-string value
+                collected.append(page_data if isinstance(page_data, str) else json.dumps(page_data))
+
+            if step_n >= max_pages:
+                logger.warning(f"Paginated content > Stopping after {step_n} pages (page limit reached)")
+                break
+
+            try:
+                next_button = browsersteps_interface.get_locator(self.webdriver_paginated_next_selector)
+                next_button.wait_for()
+                next_button.click()
+                browsersteps_interface.action_wait_for_load_state(selector=None)
+                step_n += 1
+            except PlaywrightTimeoutError:
+                # This just means the button could not be found.
+                logger.debug("Paginated content > Next button could not be found")
+                break
+
+        self.page.evaluate('''(data) => {
+            const el = document.createElement('input');
+            el.id = 'cd_data';
+            el.type = 'hidden';
+            el.value = data;
+            document.body.appendChild(el);
+        }''', ",".join(collected))
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 81869988..ef2d9b4c 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -494,7 +494,12 @@ class processor_text_json_diff_form(commonSettingsForm):
     if os.getenv("PLAYWRIGHT_DRIVER_URL"):
         browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
     text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
+
     webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
+
+    webdriver_enable_pagination = BooleanField('Enable paginated mode', default=False)
+    webdriver_paginated_js_execute_each_page = TextAreaField('(Paginated) Execute JavaScript on each page', render_kw={"rows": "5"}, validators=[validators.Optional()])
+    webdriver_paginated_next_selector = TextAreaField('(Paginated) Next page button selector', validators=[validators.Optional()])
 
     save_button = SubmitField('Save', render_kw={"class": "pure-button button-small pure-button-primary"})
 
diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py
index 36d12384..9d0c3412 100644
--- a/changedetectionio/model/__init__.py
+++ b/changedetectionio/model/__init__.py
@@ -68,6 +68,9 @@ class watch_base(dict):
             'uuid': str(uuid.uuid4()),
             'webdriver_delay': None,
             'webdriver_js_execute_code': None,  # Run before change-detection
+            'webdriver_enable_pagination': False,  # Collect content across pages by following a "next page" element
+            'webdriver_paginated_js_execute_each_page': None,  # Paginated mode: JS run on every page to extract its data
+            'webdriver_paginated_next_selector': None,  # Paginated mode: selector of the "next page" element
         })
 
super(watch_base, self).__init__(*arg, **kw) diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 2dcc9730..f7b38c8a 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -141,6 +141,12 @@ class difference_detection_processor(): if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip(): self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code') + self.fetcher.webdriver_enable_pagination = self.watch.get('webdriver_enable_pagination', False) + if self.watch.get('webdriver_paginated_js_execute_each_page') is not None and self.watch.get('webdriver_paginated_js_execute_each_page').strip(): + self.fetcher.webdriver_paginated_js_execute_each_page = self.watch.get('webdriver_paginated_js_execute_each_page') + if self.watch.get('webdriver_paginated_next_selector') is not None and self.watch.get('webdriver_paginated_next_selector').strip(): + self.fetcher.webdriver_paginated_next_selector = self.watch.get('webdriver_paginated_next_selector') + # Requests for PDF's, images etc should be passwd the is_binary flag is_binary = self.watch.is_pdf diff --git a/changedetectionio/static/js/vis.js b/changedetectionio/static/js/vis.js index edcacfb6..adb3ae02 100644 --- a/changedetectionio/static/js/vis.js +++ b/changedetectionio/static/js/vis.js @@ -2,6 +2,7 @@ $(document).ready(function () { // Lazy Hide/Show elements mechanism $('[data-visible-for]').hide(); + function show_related_elem(e) { var n = $(e).attr('name') + "=" + $(e).val(); if (n === 'fetch_backend=system') { @@ -9,16 +10,31 @@ $(document).ready(function () { } $(`[data-visible-for~="${n}"]`).show(); } + + function toggle_related_elem(e) { + var n = $(e).attr('name') + "=" + $(e).val(); + if (n === 'fetch_backend=system') { + n = "fetch_backend=" + default_system_fetch_backend; + } + console.log('n: ' + n); + 
$(`[data-visible-for~="${n}"]`).toggle(); + } + $(':radio').on('keyup keypress blur change click', function (e) { $(`[data-visible-for]`).hide(); $('.advanced-options').hide(); show_related_elem(this); }); - $(':radio:checked').each(function (e) { show_related_elem(this); - }) + }); + $(':checkbox').on('change', function (e) { + toggle_related_elem(this); + }); + $(':checkbox:checked').each(function (e) { + show_related_elem(this); + }); // Show advanced $('.show-advanced').click(function (e) { diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index ecaf7ed9..63c1ea0c 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -591,6 +591,10 @@ footer { .pure-controls { padding-bottom: 1em; + &.spacing-top { + padding-top: 1rem; + } + div { margin: 0px; } @@ -609,6 +613,10 @@ footer { legend { color: var(--color-text-legend); } + + pre { + margin-bottom: 0; + } } /* The input fields with errors */ diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index 1600fde3..667a2393 100644 --- a/changedetectionio/static/styles/styles.css +++ b/changedetectionio/static/styles/styles.css @@ -925,6 +925,10 @@ footer { .pure-form .pure-group, .pure-form .pure-controls { padding-bottom: 1em; } + .pure-form .pure-control-group.spacing-top, + .pure-form .pure-group.spacing-top, + .pure-form .pure-controls.spacing-top { + padding-top: 1rem; } .pure-form .pure-control-group div, .pure-form .pure-group div, .pure-form .pure-controls div { @@ -942,6 +946,10 @@ footer { .pure-form .pure-group legend, .pure-form .pure-controls legend { color: var(--color-text-legend); } + .pure-form .pure-control-group pre, + .pure-form .pure-group pre, + .pure-form .pure-controls pre { + margin-bottom: 0; } .pure-form .error input { background-color: var(--color-error-input); } .pure-form ul.errors { diff --git 
a/changedetectionio/store.py b/changedetectionio/store.py index 697da5bc..38c4b3e8 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -310,6 +310,9 @@ class ChangeDetectionStore: 'trigger_text', 'url', 'webdriver_js_execute_code', + 'webdriver_enable_pagination', + 'webdriver_paginated_js_execute_each_page', + 'webdriver_paginated_next_selector', ]: if res.get(k): if k != 'css_filter': diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index a0a8f988..4045231c 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -137,6 +137,27 @@ href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More help and examples here + +
+ {{ render_checkbox_field(form.webdriver_enable_pagination) }} +
+ Running in pagination mode will extract data from each page and inject the combined result into the last page as the following element:
<input type="hidden" id="cd_data" value="<extracted data here>">
+
+
+ +
+ {{ render_field(form.webdriver_paginated_js_execute_each_page) }} +
+ This code will be executed on each page to extract the data. +
+ + {{ render_field(form.webdriver_paginated_next_selector) }} +
+ This selector defines the element that should be clicked to navigate to the next page. If this element cannot be found, + the watch will treat the current page as the last page.
+
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index af54eff4..d3a0b9ce 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -462,6 +462,16 @@ class update_worker(threading.Thread): if e.message: err_text = "{} - {}".format(err_text, e.message) + if e.screenshot: + watch.save_screenshot(screenshot=e.screenshot, as_error=True) + + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, + 'last_check_status': e.status_code, + 'has_ldjson_price_data': None}) + process_changedetection_results = False + except content_fetchers_exceptions.PaginatedContentMisconfigured as e: + err_text = "Paginated content fetching is not configured properly. Did you fill in all fields?" + if e.screenshot: watch.save_screenshot(screenshot=e.screenshot, as_error=True)