feat(playwright): add functionality to fetch paginated content

2 months ago · c528f66914
parent 0f0f5af7b5
commit c528f66914
13 changed files with 160 additions and 6 deletions
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@ -173,6 +173,9 @@ class steppable_browser_interface():
        v = json.dumps(value)
        self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)

+    def action_wait_for_load_state(self, selector, value='networkidle'):
+        """
+        Block until the current page reaches the requested load state.
+
+        :param selector: Unused; accepted because action_* methods take (selector, value).
+        :param value: Load state name passed to the page, 'networkidle' by default.
+        """
+        load_state = value
+        current_page = self.page
+        current_page.wait_for_load_state(load_state)
+
    # @todo - in the future make some popout interface to capture what needs to be set
    # https://playwright.dev/python/docs/api/class-keyboard
    def action_press_enter(self, selector, value):
@ -190,6 +193,9 @@ class steppable_browser_interface():
    def action_uncheck_checkbox(self, selector, value):
        self.page.locator(selector, timeout=1000).uncheck(timeout=1000)

+    def get_locator(self, selector):
+        """
+        Return the page's locator for `selector`.
+
+        :param selector: Selector string handed unchanged to `self.page.locator`.
+        :return: Whatever `self.page.locator(selector)` returns.
+        """
+        return self.page.locator(selector)
+

 # Responsible for maintaining a live 'context' with the chrome CDP
 # @todo - how long do contexts live for anyway?
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@ -52,6 +52,9 @@ class Fetcher():
    instock_data_js = ""
    status_code = None
    webdriver_js_execute_code = None
+    webdriver_enable_pagination = False
+    webdriver_paginated_js_execute_each_page = None
+    webdriver_paginated_next_selector = None
    xpath_data = None
    xpath_element_js = ""

--- a/changedetectionio/content_fetchers/exceptions/init.py
+++ b/changedetectionio/content_fetchers/exceptions/init.py
@ -95,3 +95,8 @@ class ReplyWithContentButNoText(Exception):
        self.html_content = html_content
        self.xpath_data = xpath_data
        return
+
+
+class PaginatedContentMisconfigured(Exception):
+    """
+    Raised when paginated fetching is enabled but its required settings
+    (per-page JavaScript and/or next-page selector) are missing.
+    """
+
+    def __init__(self, screenshot=None, status_code=None):
+        # The worker's handler for this exception reads `e.screenshot`; without the
+        # attribute the handler itself would fail with AttributeError.
+        self.screenshot = screenshot
+        # NOTE(review): status_code is provided defensively, mirroring the sibling fetch
+        # exceptions that carry one - confirm whether the handler reads it.
+        self.status_code = status_code
+        super().__init__("Paginated content fetching is misconfigured")
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@ -5,7 +5,7 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, PaginatedContentMisconfigured, ScreenshotUnavailable

 class fetcher(Fetcher):
    fetcher_description = "Playwright {}/Javascript".format(
@ -133,10 +133,13 @@ class fetcher(Fetcher):
                browser.close()
                logger.debug("Content Fetcher > Response object from the browser communication was none")
                raise EmptyReply(url=url, status_code=None)
-
+            
            try:
                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
-                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
+                    if self.webdriver_enable_pagination == True:
+                        self.run_paginated(url=url)
+                    else:
+                        self.run_normal(browsersteps_interface=browsersteps_interface)
            except playwright._impl._errors.TimeoutError as e:
                context.close()
                browser.close()
@ -147,7 +150,7 @@ class fetcher(Fetcher):
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))
-
+            
            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
            self.page.wait_for_timeout(extra_wait * 1000)

@ -209,3 +212,60 @@ class fetcher(Fetcher):
            finally:
                context.close()
                browser.close()
+
+    def run_normal(self, browsersteps_interface):
+        """
+        Non-paginated mode: execute the configured JavaScript once.
+
+        :param browsersteps_interface: Object whose `action_execute_js` runs the code.
+        """
+        js_code = self.webdriver_js_execute_code
+        browsersteps_interface.action_execute_js(selector=None, value=js_code)
+
+    def run_paginated(self, url):
+        """
+        Collect content across a paginated site and inject it into the final page.
+
+        Order of operations:
+        1. Execute the initial JS code (`webdriver_js_execute_code`) and wait for the page to settle.
+        2. On every page:
+            a. Execute `webdriver_paginated_js_execute_each_page` and keep its result
+            b. Wait for the element matched by `webdriver_paginated_next_selector` and click it
+            c. Stop as soon as waiting for / clicking that element (or the following
+               load-state wait) times out
+        3. Write the comma-joined results to a hidden input element with ID "cd_data"
+
+        :param url: Passed as `start_url` when constructing the browser-steps helper.
+        :raises PaginatedContentMisconfigured: If the per-page JS or the next-page selector is empty.
+        """
+        if not self.webdriver_paginated_js_execute_each_page or not self.webdriver_paginated_next_selector:
+            raise PaginatedContentMisconfigured()
+
+        import json
+
+        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
+        from playwright._impl._errors import TimeoutError
+
+        browsersteps_interface = steppable_browser_interface(start_url=url)
+        browsersteps_interface.page = self.page
+
+        browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
+        browsersteps_interface.action_wait_for_load_state(selector=None)
+
+        collected = []
+        step_n = 1
+        # NOTE(review): the loop has no upper bound; a "next" element that never disappears
+        # keeps it running - consider a page cap.
+        while True:
+            logger.debug(f"Paginated content > Page {step_n}")
+            page_result = browsersteps_interface.action_execute_js(value=self.webdriver_paginated_js_execute_each_page, selector=None)
+
+            # The per-page script may evaluate to something other than a string
+            # (nothing at all, an object, a number); normalise it so joining cannot fail.
+            if page_result is None:
+                chunk = ""
+            elif isinstance(page_result, str):
+                chunk = page_result
+            else:
+                # default=str is deliberate: values json cannot encode natively are stringified.
+                chunk = json.dumps(page_result, default=str)
+
+            # An empty result is only skipped while nothing has been collected yet,
+            # so the output never starts with a separator.
+            if collected or chunk:
+                collected.append(chunk)
+
+            try:
+                next_button = browsersteps_interface.get_locator(self.webdriver_paginated_next_selector)
+                next_button.wait_for()
+                next_button.click()
+                browsersteps_interface.action_wait_for_load_state(selector=None)
+                step_n += 1
+            except TimeoutError:
+                # A timeout here is treated as "no further page".
+                logger.debug("Paginated content > Next button could not be found")
+                break
+
+        data = ",".join(collected)
+        self.page.evaluate('''(data) => {
+            const el = document.createElement('input');
+            el.id = 'cd_data';
+            el.type = 'hidden';
+            el.value = data;
+            document.body.appendChild(el);
+        }''', data)
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -494,7 +494,12 @@ class processor_text_json_diff_form(commonSettingsForm):
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
    text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
+
    webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
+    
+    webdriver_enable_pagination = BooleanField('Enable paginated mode', default=False)
+    webdriver_paginated_js_execute_each_page = TextAreaField('(Paginated) Execute JavaScript on each page', render_kw={"rows": "5"}, validators=[validators.Optional()])
+    webdriver_paginated_next_selector = TextAreaField('(Paginated) Next page button selector', validators=[validators.Optional()])

    save_button = SubmitField('Save', render_kw={"class": "pure-button button-small pure-button-primary"})

--- a/changedetectionio/model/init.py
+++ b/changedetectionio/model/init.py
@ -68,6 +68,9 @@ class watch_base(dict):
            'uuid': str(uuid.uuid4()),
            'webdriver_delay': None,
            'webdriver_js_execute_code': None,  # Run before change-detection
+            'webdriver_enable_pagination': False,  # Fetch in paginated mode instead of running the JS once
+            'webdriver_paginated_js_execute_each_page': None,  # JS executed on every page while paginating
+            'webdriver_paginated_next_selector': None,  # Selector of the "next page" element to click
        })

        super(watch_base, self).__init__(*arg, **kw)
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@ -141,6 +141,12 @@ class difference_detection_processor():
        if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip():
            self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code')

+        self.fetcher.webdriver_enable_pagination = self.watch.get('webdriver_enable_pagination', False)
+        if self.watch.get('webdriver_paginated_js_execute_each_page') is not None and self.watch.get('webdriver_paginated_js_execute_each_page').strip():
+            self.fetcher.webdriver_paginated_js_execute_each_page = self.watch.get('webdriver_paginated_js_execute_each_page')
+        if self.watch.get('webdriver_paginated_next_selector') is not None and self.watch.get('webdriver_paginated_next_selector').strip():
+            self.fetcher.webdriver_paginated_next_selector = self.watch.get('webdriver_paginated_next_selector')
+
        # Requests for PDF's, images etc should be passwd the is_binary flag
        is_binary = self.watch.is_pdf

--- a/changedetectionio/static/js/vis.js
+++ b/changedetectionio/static/js/vis.js
@ -2,6 +2,7 @@ $(document).ready(function () {

    // Lazy Hide/Show elements mechanism
    $('[data-visible-for]').hide();
+
    function show_related_elem(e) {
        var n = $(e).attr('name') + "=" + $(e).val();
        if (n === 'fetch_backend=system') {
@ -9,16 +10,31 @@ $(document).ready(function () {
        }
        $(`[data-visible-for~="${n}"]`).show();
    }
+
+    // Show or hide the elements whose data-visible-for lists this checkbox's "name=value" key.
+    function toggle_related_elem(e) {
+        var n = $(e).attr('name') + "=" + $(e).val();
+        if (n === 'fetch_backend=system') {
+            n = "fetch_backend=" + default_system_fetch_backend;
+        }
+        // Drive visibility from the checked state rather than flipping it blindly:
+        // the radio handler hides every [data-visible-for] element, after which a
+        // plain toggle() would show the section on uncheck and hide it on check.
+        $(`[data-visible-for~="${n}"]`).toggle($(e).is(':checked'));
+    }
+
    $(':radio').on('keyup keypress blur change click', function (e) {
        $(`[data-visible-for]`).hide();
        $('.advanced-options').hide();
        show_related_elem(this);
    });
-
    $(':radio:checked').each(function (e) {
       show_related_elem(this);
-    })
+    });

+    // Update the visibility of dependent elements whenever a checkbox changes
+    $(':checkbox').on('change', function (e) {
+        toggle_related_elem(this);
+    });
+    // On load, reveal elements tied to checkboxes that are already checked
+    $(':checkbox:checked').each(function (e) {
+        show_related_elem(this);
+    });

    // Show advanced
    $('.show-advanced').click(function (e) {
--- a/changedetectionio/static/styles/scss/styles.scss
+++ b/changedetectionio/static/styles/scss/styles.scss
@ -591,6 +591,10 @@ footer {
  .pure-controls {
    padding-bottom: 1em;

+    &.spacing-top {
+      padding-top: 1rem;
+    }
+
    div {
      margin: 0px;
    }
@ -609,6 +613,10 @@ footer {
    legend {
      color: var(--color-text-legend);
    }
+
+    pre {
+      margin-bottom: 0;
+    }
  }

  /* The input fields with errors */
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@ -925,6 +925,10 @@ footer {
  .pure-form .pure-group,
  .pure-form .pure-controls {
    padding-bottom: 1em; }
+    .pure-form .pure-control-group.spacing-top,
+    .pure-form .pure-group.spacing-top,
+    .pure-form .pure-controls.spacing-top {
+      padding-top: 1rem; }
    .pure-form .pure-control-group div,
    .pure-form .pure-group div,
    .pure-form .pure-controls div {
@ -942,6 +946,10 @@ footer {
    .pure-form .pure-group legend,
    .pure-form .pure-controls legend {
      color: var(--color-text-legend); }
+    .pure-form .pure-control-group pre,
+    .pure-form .pure-group pre,
+    .pure-form .pure-controls pre {
+      margin-bottom: 0; }
  .pure-form .error input {
    background-color: var(--color-error-input); }
  .pure-form ul.errors {
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -310,6 +310,9 @@ class ChangeDetectionStore:
                    'trigger_text',
                    'url',
                    'webdriver_js_execute_code',
+                    'webdriver_enable_pagination',
+                    'webdriver_paginated_js_execute_each_page',
+                    'webdriver_paginated_next_selector',
                ]:
                    if res.get(k):
                        if k != 'css_filter':
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -137,6 +137,27 @@
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More
                            help and examples here</a>
                        </div>
+
+                        <div class="pure-control-group spacing-top">
+                            {{ render_checkbox_field(form.webdriver_enable_pagination) }}
+                            <div class="pure-form-message-inline">
+                                Running in pagination mode will extract data from each page and inject it into the last page in the following element:
+                                <pre>&lt;input type="hidden" id="cd_data" value="&lt;extracted data here&gt;"&gt;</pre>
+                            </div>
+                        </div>
+
+                        <div data-visible-for="webdriver_enable_pagination=y" style="display: none;">
+                            {{ render_field(form.webdriver_paginated_js_execute_each_page) }}
+                            <div class="pure-form-message-inline">
+                                This code will be executed on each page to extract the data.
+                            </div>
+                            
+                            {{ render_field(form.webdriver_paginated_next_selector) }}
+                            <div class="pure-form-message-inline">
+                                This selector defines the element that should be clicked to navigate to the next page. If this element cannot be found,
+                                the watch will interpret it as the last page.
+                            </div>
+                        </div>
                    </div>
                </fieldset>
                <!-- html requests always -->
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@ -462,6 +462,16 @@ class update_worker(threading.Thread):
                        if e.message:
                            err_text = "{} - {}".format(err_text, e.message)

+                        if e.screenshot:
+                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
+
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'last_check_status': e.status_code,
+                                                                           'has_ldjson_price_data': None})
+                        process_changedetection_results = False
+                    except content_fetchers_exceptions.PaginatedContentMisconfigured as e:
+                        err_text = "Paginated content fetching is not configured properly. Did you fill in all fields?"
+
                        if e.screenshot:
                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)