feat(playwright): add functionality to fetch paginated content

pull/2780/head
Danique Wijnalda 2 months ago
parent 0f0f5af7b5
commit c528f66914
No known key found for this signature in database

@ -173,6 +173,9 @@ class steppable_browser_interface():
v = json.dumps(value)
self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)
def action_wait_for_load_state(self, selector, value='networkidle'):
    """
    Block until the current page reaches the requested load state.

    :param selector: not used by this method.
    :param value: load state name handed to the page's ``wait_for_load_state``
                  (defaults to ``'networkidle'``).
    """
    current_page = self.page
    current_page.wait_for_load_state(value)
# @todo - in the future make some popout interface to capture what needs to be set
# https://playwright.dev/python/docs/api/class-keyboard
def action_press_enter(self, selector, value):
@ -190,6 +193,9 @@ class steppable_browser_interface():
def action_uncheck_checkbox(self, selector, value):
    """
    Uncheck the checkbox matched by ``selector``.

    :param selector: selector of the checkbox element.
    :param value: not used by this method.
    """
    # The timeout belongs on the action only: locator() merely builds a lazy
    # handle and does not take a `timeout` keyword.
    self.page.locator(selector).uncheck(timeout=1000)
def get_locator(self, selector):
    """
    Build a locator for ``selector`` on the current page.

    :param selector: selector string passed through to the page's ``locator``.
    :return: whatever the page's ``locator`` call returns.
    """
    current_page = self.page
    found = current_page.locator(selector)
    return found
# Responsible for maintaining a live 'context' with the chrome CDP
# @todo - how long do contexts live for anyway?

@ -52,6 +52,9 @@ class Fetcher():
instock_data_js = ""
status_code = None
webdriver_js_execute_code = None
webdriver_enable_pagination = False
webdriver_paginated_js_execute_each_page = None
webdriver_paginated_next_selector = None
xpath_data = None
xpath_element_js = ""

@ -95,3 +95,8 @@ class ReplyWithContentButNoText(Exception):
self.html_content = html_content
self.xpath_data = xpath_data
return
class PaginatedContentMisconfigured(Exception):
    """
    Raised when paginated fetching is enabled but its required settings
    (the per-page JavaScript and/or the "next page" selector) are missing.
    """

    def __init__(self, screenshot=None):
        """
        :param screenshot: optional screenshot taken when the problem was
                           detected; ``None`` when none is available.
        """
        super().__init__()
        # Always define the attribute: error handlers read `e.screenshot`
        # and must not fail with AttributeError when no screenshot exists.
        self.screenshot = screenshot

@ -5,7 +5,7 @@ from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, PaginatedContentMisconfigured, ScreenshotUnavailable
class fetcher(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
@ -133,10 +133,13 @@ class fetcher(Fetcher):
browser.close()
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)
try:
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
if self.webdriver_enable_pagination == True:
self.run_paginated(url=url)
else:
self.run_normal(browsersteps_interface=browsersteps_interface)
except playwright._impl._errors.TimeoutError as e:
context.close()
browser.close()
@ -147,7 +150,7 @@ class fetcher(Fetcher):
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000)
@ -209,3 +212,60 @@ class fetcher(Fetcher):
finally:
context.close()
browser.close()
def run_normal(self, browsersteps_interface):
    """
    Run normal (single page) content extraction.

    Executes the watch's "before change detection" JavaScript once through
    the given browser-steps interface.

    :param browsersteps_interface: object exposing
                                   ``action_execute_js(value=..., selector=...)``.
    """
    js_code = self.webdriver_js_execute_code
    # Nothing configured (None or empty string): do not hand an empty
    # script to the browser.
    if not js_code:
        return
    browsersteps_interface.action_execute_js(value=js_code, selector=None)
def run_paginated(self, url):
    """
    Run paginated content extraction in the following order:

    1. Execute the initial JS code (if any) after the page is loaded.
    2. a. Execute the per-page JS code to extract content from the page.
       b. Look for the "next page" element and click it if it exists.
       c. Repeat step 2 until the "next page" element is not found.
    3. Write the extracted content, joined with commas, to a hidden input
       element with ID "cd_data" appended to the body of the last page.

    :param url: start URL handed to the browser-steps interface.
    :raises PaginatedContentMisconfigured: when the per-page JS or the
        "next page" selector is missing or empty.
    """
    import json

    per_page_js = self.webdriver_paginated_js_execute_each_page
    next_selector = self.webdriver_paginated_next_selector
    if not per_page_js or not next_selector:
        raise PaginatedContentMisconfigured()

    from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
    # Aliased so the builtin TimeoutError is not shadowed inside this method.
    from playwright._impl._errors import TimeoutError as PlaywrightTimeoutError

    browsersteps_interface = steppable_browser_interface(start_url=url)
    browsersteps_interface.page = self.page

    # The initial script is optional; only run it when one is configured.
    if self.webdriver_js_execute_code:
        browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
    browsersteps_interface.action_wait_for_load_state(selector=None)

    chunks = []
    step_n = 1

    # NOTE(review): the loop ends only when the "next page" element stops
    # appearing; a site whose last page keeps such an element would loop
    # forever. Consider a page limit.
    while True:
        logger.debug(f"Paginated content > Page {step_n}")
        result = browsersteps_interface.action_execute_js(value=per_page_js, selector=None)

        # The per-page script may return nothing, or a non-string value
        # (object, array, number). Skip empty results so no dangling
        # separators are produced, and serialise everything else so the
        # final join cannot raise TypeError.
        if result is not None and result != "":
            chunks.append(result if isinstance(result, str) else json.dumps(result))

        try:
            next_button = browsersteps_interface.get_locator(next_selector)
            next_button.wait_for()
            next_button.click()
            browsersteps_interface.action_wait_for_load_state(selector=None)
            step_n += 1
        except PlaywrightTimeoutError:
            # This just means the button could not be found.
            logger.debug("Paginated content > Next button could not be found")
            break

    self.page.evaluate('''(data) => {
        const el = document.createElement('input');
        el.id = 'cd_data';
        el.type = 'hidden';
        el.value = data;
        document.body.appendChild(el);
    }''', ",".join(chunks))

@ -494,7 +494,12 @@ class processor_text_json_diff_form(commonSettingsForm):
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])
webdriver_enable_pagination = BooleanField('Enable paginated mode', default=False)
webdriver_paginated_js_execute_each_page = TextAreaField('(Paginated) Execute JavaScript on each page', render_kw={"rows": "5"}, validators=[validators.Optional()])
webdriver_paginated_next_selector = TextAreaField('(Paginated) Next page button selector', validators=[validators.Optional()])
save_button = SubmitField('Save', render_kw={"class": "pure-button button-small pure-button-primary"})

@ -68,6 +68,9 @@ class watch_base(dict):
'uuid': str(uuid.uuid4()),
'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection
'webdriver_enable_pagination': False, # Run before change-detection
'webdriver_paginated_js_execute_each_page': None, # Run before change-detection
'webdriver_paginated_next_selector': None, # Run before change-detection
})
super(watch_base, self).__init__(*arg, **kw)

@ -141,6 +141,12 @@ class difference_detection_processor():
if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip():
self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code')
self.fetcher.webdriver_enable_pagination = self.watch.get('webdriver_enable_pagination', False)
if self.watch.get('webdriver_paginated_js_execute_each_page') is not None and self.watch.get('webdriver_paginated_js_execute_each_page').strip():
self.fetcher.webdriver_paginated_js_execute_each_page = self.watch.get('webdriver_paginated_js_execute_each_page')
if self.watch.get('webdriver_paginated_next_selector') is not None and self.watch.get('webdriver_paginated_next_selector').strip():
self.fetcher.webdriver_paginated_next_selector = self.watch.get('webdriver_paginated_next_selector')
# Requests for PDF's, images etc should be passwd the is_binary flag
is_binary = self.watch.is_pdf

@ -2,6 +2,7 @@ $(document).ready(function () {
// Lazy Hide/Show elements mechanism
$('[data-visible-for]').hide();
function show_related_elem(e) {
var n = $(e).attr('name') + "=" + $(e).val();
if (n === 'fetch_backend=system') {
@ -9,16 +10,31 @@ $(document).ready(function () {
}
$(`[data-visible-for~="${n}"]`).show();
}
// Show or hide the elements tied to a checkbox via data-visible-for="name=value".
// Visibility follows the checkbox's checked state instead of being flipped
// blindly, so it stays correct even if something else has hidden the
// elements in the meantime.
function toggle_related_elem(e) {
    var n = $(e).attr('name') + "=" + $(e).val();
    if (n === 'fetch_backend=system') {
        n = "fetch_backend=" + default_system_fetch_backend;
    }
    $(`[data-visible-for~="${n}"]`).toggle($(e).is(':checked'));
}
$(':radio').on('keyup keypress blur change click', function (e) {
$(`[data-visible-for]`).hide();
$('.advanced-options').hide();
show_related_elem(this);
});
$(':radio:checked').each(function (e) {
show_related_elem(this);
})
});
// Checkboxes: update the visibility of their dependent [data-visible-for] elements whenever one changes
$(':checkbox').on('change', function (e) {
toggle_related_elem(this);
});
// On initial load, reveal the dependent elements of every checkbox that is already checked
$(':checkbox:checked').each(function (e) {
show_related_elem(this);
});
// Show advanced
$('.show-advanced').click(function (e) {

@ -591,6 +591,10 @@ footer {
.pure-controls {
padding-bottom: 1em;
&.spacing-top {
padding-top: 1rem;
}
div {
margin: 0px;
}
@ -609,6 +613,10 @@ footer {
legend {
color: var(--color-text-legend);
}
pre {
margin-bottom: 0;
}
}
/* The input fields with errors */

@ -925,6 +925,10 @@ footer {
.pure-form .pure-group,
.pure-form .pure-controls {
padding-bottom: 1em; }
.pure-form .pure-control-group.spacing-top,
.pure-form .pure-group.spacing-top,
.pure-form .pure-controls.spacing-top {
padding-top: 1rem; }
.pure-form .pure-control-group div,
.pure-form .pure-group div,
.pure-form .pure-controls div {
@ -942,6 +946,10 @@ footer {
.pure-form .pure-group legend,
.pure-form .pure-controls legend {
color: var(--color-text-legend); }
.pure-form .pure-control-group pre,
.pure-form .pure-group pre,
.pure-form .pure-controls pre {
margin-bottom: 0; }
.pure-form .error input {
background-color: var(--color-error-input); }
.pure-form ul.errors {

@ -310,6 +310,9 @@ class ChangeDetectionStore:
'trigger_text',
'url',
'webdriver_js_execute_code',
'webdriver_enable_pagination',
'webdriver_paginated_js_execute_each_page',
'webdriver_paginated_next_selector',
]:
if res.get(k):
if k != 'css_filter':

@ -137,6 +137,27 @@
href="https://github.com/dgtlmoon/changedetection.io/wiki/Run-JavaScript-before-change-detection">More
help and examples here</a>
</div>
<div class="pure-control-group spacing-top">
{{ render_checkbox_field(form.webdriver_enable_pagination) }}
<div class="pure-form-message-inline">
Pagination mode extracts data from each page and injects the combined result into the last page, in the following element:
<pre>&lt;input type="hidden" id="cd_data" value="&lt;extracted data here&gt;"&gt;</pre>
</div>
</div>
<div data-visible-for="webdriver_enable_pagination=y" style="display: none;">
{{ render_field(form.webdriver_paginated_js_execute_each_page) }}
<div class="pure-form-message-inline">
This code will be executed on each page to extract the data.
</div>
{{ render_field(form.webdriver_paginated_next_selector) }}
<div class="pure-form-message-inline">
This selector defines the element that should be clicked to navigate to the next page. If this element cannot be found,
the watch will treat the current page as the last page.
</div>
</div>
</div>
</fieldset>
<!-- html requests always -->

@ -462,6 +462,16 @@ class update_worker(threading.Thread):
if e.message:
err_text = "{} - {}".format(err_text, e.message)
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except content_fetchers_exceptions.PaginatedContentMisconfigured as e:
err_text = "Paginated content fetching is not configured properly. Did you fill in all fields?"
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)

Loading…
Cancel
Save