diff --git a/changedetectionio/apprise_plugin/__init__.py b/changedetectionio/apprise_plugin/__init__.py
new file mode 100644
index 00000000..93c382fa
--- /dev/null
+++ b/changedetectionio/apprise_plugin/__init__.py
@@ -0,0 +1,76 @@
+# include the decorator
+from apprise.decorators import notify
+
+@notify(on="delete")
+@notify(on="deletes")
+@notify(on="get")
+@notify(on="gets")
+@notify(on="post")
+@notify(on="posts")
+@notify(on="put")
+@notify(on="puts")
+def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):
+    import requests
+    import json
+    from apprise.utils import parse_url as apprise_parse_url
+    from apprise import URLBase
+
+    url = kwargs['meta'].get('url')
+
+    if url.startswith('post'):
+        r = requests.post
+    elif url.startswith('get'):
+        r = requests.get
+    elif url.startswith('put'):
+        r = requests.put
+    elif url.startswith('delete'):
+        r = requests.delete
+
+    url = url.replace('post://', 'http://')
+    url = url.replace('posts://', 'https://')
+    url = url.replace('put://', 'http://')
+    url = url.replace('puts://', 'https://')
+    url = url.replace('get://', 'http://')
+    url = url.replace('gets://', 'https://')
+    url = url.replace('delete://', 'http://')
+    url = url.replace('deletes://', 'https://')
+
+    headers = {}
+    params = {}
+    auth = None
+
+    # Convert /foobar?+some-header=hello to a proper header dictionary
+    results = apprise_parse_url(url)
+    if results:
+        # Add to our returned result set the headers that the user can
+        # potentially override, and tidy entries by unquoting them
+        headers = {URLBase.unquote(x): URLBase.unquote(y)
+                   for x, y in results['qsd+'].items()}
+
+        # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
+        # Apprise relies on prefixing each request arg with "-" (it uses args like &method=update as flags),
+        # but here we are making straight requests, so we need to convert these against Apprise's logic
+        for k, v in results['qsd'].items():
+            if k.strip('+-') not in results['qsd+'].keys():
+                params[URLBase.unquote(k)] = URLBase.unquote(v)
+
+        # Determine Authentication
+        auth = ''
+        if results.get('user') and results.get('password'):
+            auth = (URLBase.unquote(results.get('user')), URLBase.unquote(results.get('password')))
+        elif results.get('user'):
+            auth = (URLBase.unquote(results.get('user')), '')
+
+    # Try to auto-guess if it's JSON
+    try:
+        json.loads(body)
+        headers['Content-Type'] = 'application/json; charset=utf-8'
+    except ValueError:
+        pass
+
+    r(results.get('url'),
+      auth=auth,
+      data=body.encode('utf-8') if type(body) is str else body,
+      headers=headers,
+      params=params
+      )
\ No newline at end of file
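For reference, with the handler above, a notification URL such as posts://user:pass@example.com/webhook?+x-api-key=secret&mode=update (hypothetical endpoint) unpacks roughly as follows: the "+"-prefixed query args become headers, the remaining args become query params, and the scheme maps to plain HTTP(S). A sketch of the equivalent plain-requests call, not part of the patch itself:

    import requests

    body = '{"status": "ok"}'  # the rendered notification body
    requests.post('https://example.com/webhook',    # posts:// -> https://
                  auth=('user', 'pass'),            # user:pass@ from the URL
                  headers={'x-api-key': 'secret'},  # +x-api-key=secret
                  params={'mode': 'update'},        # mode=update (no "+" prefix)
                  data=body)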
logger.debug("Clicking element containing text") if not len(value.strip()): return elem = self.page.get_by_text(value) if elem.count(): elem.first.click(delay=randint(200, 500), timeout=3000) + def action_click_element_containing_text_if_exists(self, selector=None, value=''): + logger.debug("Clicking element containing text if exists") + if not len(value.strip()): + return + elem = self.page.get_by_text(value) + logger.debug(f"Clicking element containing text - {elem.count()} elements found") + if elem.count(): + elem.first.click(delay=randint(200, 500), timeout=3000) + else: + return + def action_enter_text_in_field(self, selector, value): if not len(selector.strip()): return diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index c39b2636..149d7f96 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -1,8 +1,6 @@ from loguru import logger -import chardet import hashlib import os -import requests from changedetectionio import strtobool from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived from changedetectionio.content_fetchers.base import Fetcher @@ -28,6 +26,9 @@ class fetcher(Fetcher): is_binary=False, empty_pages_are_a_change=False): + import chardet + import requests + if self.browser_steps_get_valid_steps(): raise BrowserStepsInUnsupportedFetcher(url=url) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index b6a929ac..14a7570c 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -541,7 +541,8 @@ def changedetection_app(config=None, datastore_o=None): import random from .apprise_asset import asset apobj = apprise.Apprise(asset=asset) - + # so that the custom endpoints are registered + from changedetectionio.apprise_plugin import apprise_custom_api_call_wrapper is_global_settings_form = request.args.get('mode', '') == 'global-settings' is_group_settings_form = request.args.get('mode', '') == 'group-settings' diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index abc2fc4f..f92b854a 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -221,7 +221,8 @@ class ValidateAppRiseServers(object): def __call__(self, form, field): import apprise apobj = apprise.Apprise() - + # so that the custom endpoints are registered + from changedetectionio.apprise_plugin import apprise_custom_api_call_wrapper for server_url in field.data: if not apobj.add(server_url): message = field.gettext('\'%s\' is not a valid AppRise URL.' 
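The imports added to flask_app.py and forms.py above work purely by side effect: importing changedetectionio.apprise_plugin executes the @notify(on=...) decorators, which register the custom URL schemes with Apprise. A minimal self-contained sketch of that mechanism, using a hypothetical myscheme:// scheme:

    import apprise
    from apprise.decorators import notify

    @notify(on="myscheme")  # import-time side effect: registers myscheme:// with Apprise
    def my_handler(body, title, notify_type, *args, **kwargs):
        print(f"myscheme got: {title} / {body}")

    apobj = apprise.Apprise()
    apobj.add("myscheme://anything")           # accepted only because the decorator ran
    apobj.notify(title="hello", body="world")  # invokes my_handler()

This is also why the imported name apprise_custom_api_call_wrapper is never referenced afterwards: the import alone does the work.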
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index bd5fdb8f..ffe00cd0 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,10 +1,4 @@
-
-from bs4 import BeautifulSoup
-from inscriptis import get_text
-from jsonpath_ng.ext import parse
 from typing import List
-from inscriptis.model.config import ParserConfig
-from xml.sax.saxutils import escape as xml_escape
 
 import json
 import re
@@ -39,6 +33,7 @@ def perl_style_slash_enclosed_regex_to_options(regex):
 
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
+    from bs4 import BeautifulSoup
     soup = BeautifulSoup(html_content, "html.parser")
     html_block = ""
     r = soup.select(include_filters, separator="")
@@ -56,6 +51,7 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting
 
 def subtractive_css_selector(css_selector, html_content):
+    from bs4 import BeautifulSoup
     soup = BeautifulSoup(html_content, "html.parser")
     for item in soup.select(css_selector):
         item.decompose()
@@ -181,6 +177,7 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False):
 
 # Extract/find element
 def extract_element(find='title', html_content=''):
+    from bs4 import BeautifulSoup
     #Re #106, be sure to handle when its not found
     element_text = None
@@ -194,6 +191,8 @@ def extract_element(find='title', html_content=''):
 
 #
 def _parse_json(json_data, json_filter):
+    from jsonpath_ng.ext import parse
+
     if json_filter.startswith("json:"):
         jsonpath_expression = parse(json_filter.replace('json:', ''))
         match = jsonpath_expression.find(json_data)
@@ -242,6 +241,8 @@ def _get_stripped_text_from_json_match(match):
 # json_filter - ie json:$..price
 # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
+    from bs4 import BeautifulSoup
+
     stripped_text_from_html = False
     # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
     # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
@@ -352,6 +353,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
     return "\n".encode('utf8').join(output)
 
 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
+    from xml.sax.saxutils import escape as xml_escape
     pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
     def repl(m):
         text = m.group(1)
@@ -360,6 +362,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
     return re.sub(pattern, repl, html_content)
 
 def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+    from inscriptis import get_text
+    from inscriptis.model.config import ParserConfig
+
     """Converts html string to a string with just the text.
     If ignoring rendering anchor tag content is enable, anchor tag content are also included in the text
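All of the import moves in html_tools.py follow one pattern: heavyweight parsers (bs4, jsonpath_ng, inscriptis) are now imported inside the functions that need them instead of at module import, presumably to cut startup time and baseline memory for entry points that never parse HTML. A small sketch of the trade-off, assuming bs4 is installed as in this project:

    import time

    def lazy_parse(html):
        # heavyweight import deferred to first call; cached in sys.modules afterwards
        start = time.time()
        from bs4 import BeautifulSoup
        print(f"bs4 import cost this call: {time.time() - start:.3f}s")
        return BeautifulSoup(html, "html.parser").get_text()

    print(lazy_parse("<p>one</p>"))  # first call pays the import cost
    print(lazy_parse("<p>two</p>"))  # ~0s: module already cached

Python caches modules in sys.modules, so only the first call of each function pays the cost.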
diff --git a/changedetectionio/notification.py b/changedetectionio/notification.py
index d685ab1d..a7328857 100644
--- a/changedetectionio/notification.py
+++ b/changedetectionio/notification.py
@@ -1,9 +1,10 @@
-import apprise
+
 import time
 from apprise import NotifyFormat
-import json
+import apprise
 from loguru import logger
+
 valid_tokens = {
     'base_url': '',
     'current_snapshot': '',
@@ -34,86 +35,11 @@ valid_notification_formats = {
     default_notification_format_for_watch: default_notification_format_for_watch
 }
 
-# include the decorator
-from apprise.decorators import notify
-
-@notify(on="delete")
-@notify(on="deletes")
-@notify(on="get")
-@notify(on="gets")
-@notify(on="post")
-@notify(on="posts")
-@notify(on="put")
-@notify(on="puts")
-def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):
-    import requests
-    from apprise.utils import parse_url as apprise_parse_url
-    from apprise import URLBase
-
-    url = kwargs['meta'].get('url')
-
-    if url.startswith('post'):
-        r = requests.post
-    elif url.startswith('get'):
-        r = requests.get
-    elif url.startswith('put'):
-        r = requests.put
-    elif url.startswith('delete'):
-        r = requests.delete
-
-    url = url.replace('post://', 'http://')
-    url = url.replace('posts://', 'https://')
-    url = url.replace('put://', 'http://')
-    url = url.replace('puts://', 'https://')
-    url = url.replace('get://', 'http://')
-    url = url.replace('gets://', 'https://')
-    url = url.replace('put://', 'http://')
-    url = url.replace('puts://', 'https://')
-    url = url.replace('delete://', 'http://')
-    url = url.replace('deletes://', 'https://')
-
-    headers = {}
-    params = {}
-    auth = None
-
-    # Convert /foobar?+some-header=hello to proper header dictionary
-    results = apprise_parse_url(url)
-    if results:
-        # Add our headers that the user can potentially over-ride if they wish
-        # to to our returned result set and tidy entries by unquoting them
-        headers = {URLBase.unquote(x): URLBase.unquote(y)
-                   for x, y in results['qsd+'].items()}
-
-        # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
-        # In Apprise, it relies on prefixing each request arg with "-", because it uses say &method=update as a flag for apprise
-        # but here we are making straight requests, so we need todo convert this against apprise's logic
-        for k, v in results['qsd'].items():
-            if not k.strip('+-') in results['qsd+'].keys():
-                params[URLBase.unquote(k)] = URLBase.unquote(v)
-
-        # Determine Authentication
-        auth = ''
-        if results.get('user') and results.get('password'):
-            auth = (URLBase.unquote(results.get('user')), URLBase.unquote(results.get('user')))
-        elif results.get('user'):
-            auth = (URLBase.unquote(results.get('user')))
-
-    # Try to auto-guess if it's JSON
-    try:
-        json.loads(body)
-        headers['Content-Type'] = 'application/json; charset=utf-8'
-    except ValueError as e:
-        pass
-
-    r(results.get('url'),
-      auth=auth,
-      data=body.encode('utf-8') if type(body) is str else body,
-      headers=headers,
-      params=params
-      )
 
 def process_notification(n_object, datastore):
 
+    # so that the custom endpoints are registered
+    from changedetectionio.apprise_plugin import apprise_custom_api_call_wrapper
     from .safe_jinja import render as jinja_render
     now = time.time()
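One caveat with the Content-Type auto-guess carried over into the new plugin: json.loads accepts any JSON value, not only objects, so a plain number or quoted string in the body also gets tagged as application/json. A quick illustration:

    import json

    for body in ('{"status": "ok"}', '123', '"hello"', 'plain text'):
        try:
            json.loads(body)
            print(f"{body!r} -> sent as application/json")
        except ValueError:
            print(f"{body!r} -> no explicit Content-Type")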
diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py
index 66503a00..803066c6 100644
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -2,8 +2,7 @@ from .. import difference_detection_processor
 from ..exceptions import ProcessorException
 from . import Restock
 from loguru import logger
-import hashlib
-import re
+
 import urllib3
 import time
@@ -27,6 +26,25 @@ def _search_prop_by_value(matches, value):
         if value in prop[0]:
             return prop[1]  # Yield the desired value and exit the function
 
+def _deduplicate_prices(data):
+    seen = set()
+    unique_data = []
+
+    for datum in data:
+        # Convert 'value' to float if it can be a numeric string, otherwise leave it as is
+        try:
+            normalized_value = float(datum.value) if isinstance(datum.value, str) and datum.value.replace('.', '', 1).isdigit() else datum.value
+        except ValueError:
+            normalized_value = datum.value
+
+        # If the normalized value hasn't been seen yet, add it to unique data
+        if normalized_value not in seen:
+            unique_data.append(datum)
+            seen.add(normalized_value)
+
+    return unique_data
+
+
 # should return Restock()
 # add casting?
 def get_itemprop_availability(html_content) -> Restock:
@@ -36,6 +54,7 @@ def get_itemprop_availability(html_content) -> Restock:
     """
 
     from jsonpath_ng import parse
+    import re
     now = time.time()
     import extruct
     logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
@@ -60,7 +79,7 @@ def get_itemprop_availability(html_content) -> Restock:
     pricecurrency_parse = parse('$..(pricecurrency|currency|priceCurrency )')
     availability_parse = parse('$..(availability|Availability)')
 
-    price_result = price_parse.find(data)
+    price_result = _deduplicate_prices(price_parse.find(data))
     if price_result:
         # Right now, we just support single product items, maybe we will store the whole actual metadata seperately in teh future and
         # parse that for the UI?
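To see what _deduplicate_prices collapses, here is a small demo; DatumStub stands in for the jsonpath_ng match objects (which expose a .value attribute), and the import assumes the patched module above:

    from collections import namedtuple
    from changedetectionio.processors.restock_diff.processor import _deduplicate_prices

    # stand-in for jsonpath_ng match objects
    DatumStub = namedtuple('DatumStub', ['value'])

    matches = [DatumStub('19.99'), DatumStub(19.99), DatumStub('19.99'), DatumStub('24.00')]
    print([d.value for d in _deduplicate_prices(matches)])
    # ['19.99', '24.00'] - the string '19.99' and the float 19.99 normalize to the same price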
@@ -122,6 +141,10 @@ class perform_site_check(difference_detection_processor):
     xpath_data = None
 
     def run_changedetection(self, watch, skip_when_checksum_same=True):
+        import hashlib
+
+        from concurrent.futures import ProcessPoolExecutor
+        from functools import partial
 
         if not watch:
             raise Exception("Watch no longer exists.")
@@ -149,7 +172,11 @@ class perform_site_check(difference_detection_processor):
 
         itemprop_availability = {}
         try:
-            itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content)
+            with ProcessPoolExecutor() as executor:
+                # Use functools.partial to create a callable with arguments
+                # anything using bs4/lxml etc is quite "leaky"
+                future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
+                itemprop_availability = future.result()
         except MoreThanOnePriceFound as e:
             # Add the real data
             raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
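The ProcessPoolExecutor wrapping here, and again in the text_json_diff processor below, is the same trick each time: lxml/bs4 parsers tend to retain memory after large documents, so the parse runs in a short-lived child process and the OS reclaims everything when the worker exits. A minimal sketch of the pattern, where parse_heavy is a hypothetical stand-in for the real html_tools/extruct calls:

    from concurrent.futures import ProcessPoolExecutor
    from functools import partial

    def parse_heavy(content, uppercase=False):
        # stand-in for a memory-hungry parser call
        return content.upper() if uppercase else content

    def run_isolated(content):
        # the child process exits when the pool closes, returning its memory to the OS
        with ProcessPoolExecutor() as executor:
            future = executor.submit(partial(parse_heavy, content, uppercase=True))
            return future.result()  # re-raises any exception from the child

    if __name__ == '__main__':
        print(run_isolated('hello'))  # HELLO

The trade-off: arguments and results must be picklable, and every call pays process start-up cost, which is why it only makes sense around genuinely heavy parses.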
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 115cb6b1..28f86473 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError):
 class perform_site_check(difference_detection_processor):
 
     def run_changedetection(self, watch, skip_when_checksum_same=True):
+        from concurrent.futures import ProcessPoolExecutor
+        from functools import partial
+
         changed_detected = False
         html_content = ""
         screenshot = False  # as bytes
@@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor):
 
         for filter_rule in include_filters_rule:
             # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
             if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
-                html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
+                with ProcessPoolExecutor() as executor:
+                    # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
+                    future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
                                                         html_content=self.fetcher.content,
                                                         append_pretty_line_formatting=not watch.is_source_type_url,
-                                                        is_rss=is_rss)
+                                                        is_rss=is_rss))
+                    html_content += future.result()
+
             elif filter_rule.startswith('xpath1:'):
-                html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
+                with ProcessPoolExecutor() as executor:
+                    # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
+                    future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''),
                                                          html_content=self.fetcher.content,
                                                          append_pretty_line_formatting=not watch.is_source_type_url,
-                                                         is_rss=is_rss)
+                                                         is_rss=is_rss))
+                    html_content += future.result()
             else:
-                # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                html_content += html_tools.include_filters(include_filters=filter_rule,
+                with ProcessPoolExecutor() as executor:
+                    # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
+                    # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                    future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
                                                            html_content=self.fetcher.content,
-                                                           append_pretty_line_formatting=not watch.is_source_type_url)
+                                                           append_pretty_line_formatting=not watch.is_source_type_url))
+                    html_content += future.result()
 
         if not html_content.strip():
             raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
@@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor):
         else:
             # extract text
             do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
-            stripped_text_from_html = \
-                html_tools.html_to_text(
-                    html_content=html_content,
+            with ProcessPoolExecutor() as executor:
+                # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
+                # run the inscriptis text extraction in its own process for the same reason
+                future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
                                          render_anchor_tag_content=do_anchor,
-                                         is_rss=is_rss  # #1874 activate the