From e16814e40b7e16f1fb86f7d6fdb818e3ddc0b0ff Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 11 Sep 2024 11:31:07 +0200 Subject: [PATCH 1/3] Testing - locale fix for test (#2623) --- changedetectionio/tests/test_restock_itemprop.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/tests/test_restock_itemprop.py b/changedetectionio/tests/test_restock_itemprop.py index e9fd6a39..c873aa22 100644 --- a/changedetectionio/tests/test_restock_itemprop.py +++ b/changedetectionio/tests/test_restock_itemprop.py @@ -198,7 +198,8 @@ def _run_test_minmax_limit(client, extra_watch_edit_form): client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) res = client.get(url_for("index")) - assert b'1,890.45' or b'1890.45' in res.data + # Depending on the LOCALE it may be either of these (generally for US/default/etc) + assert b'1,890.45' in res.data or b'1890.45' in res.data assert b'unviewed' in res.data res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) From 7f2fa20318182495a1992c6041529f52bf53b4c9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 11 Sep 2024 14:51:32 +0200 Subject: [PATCH 2/3] Small memory allocation fixes (#2625) --- changedetectionio/apprise/__init__.py | 79 +++++++++++++++++ .../content_fetchers/requests.py | 5 +- changedetectionio/flask_app.py | 3 +- changedetectionio/forms.py | 3 +- changedetectionio/html_tools.py | 17 ++-- changedetectionio/notification.py | 85 ++----------------- changedetectionio/store.py | 2 +- 7 files changed, 103 insertions(+), 91 deletions(-) create mode 100644 changedetectionio/apprise/__init__.py diff --git a/changedetectionio/apprise/__init__.py b/changedetectionio/apprise/__init__.py new file mode 100644 index 00000000..130b1322 --- /dev/null +++ b/changedetectionio/apprise/__init__.py @@ -0,0 +1,79 @@ + +# include the decorator +from apprise.decorators import notify + +@notify(on="delete") +@notify(on="deletes") +@notify(on="get") +@notify(on="gets") +@notify(on="post") +@notify(on="posts") +@notify(on="put") +@notify(on="puts") +def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs): + import requests + import json + from apprise.utils import parse_url as apprise_parse_url + from apprise import URLBase + + url = kwargs['meta'].get('url') + + if url.startswith('post'): + r = requests.post + elif url.startswith('get'): + r = requests.get + elif url.startswith('put'): + r = requests.put + elif url.startswith('delete'): + r = requests.delete + + url = url.replace('post://', 'http://') + url = url.replace('posts://', 'https://') + url = url.replace('put://', 'http://') + url = url.replace('puts://', 'https://') + url = url.replace('get://', 'http://') + url = url.replace('gets://', 'https://') + url = url.replace('put://', 'http://') + url = url.replace('puts://', 'https://') + url = url.replace('delete://', 'http://') + url = url.replace('deletes://', 'https://') + + headers = {} + params = {} + auth = None + + # Convert /foobar?+some-header=hello to proper header dictionary + results = apprise_parse_url(url) + if results: + # Add our headers that the user can potentially over-ride if they wish + # to to our returned result set and tidy entries by unquoting them + headers = {URLBase.unquote(x): URLBase.unquote(y) + for x, y in results['qsd+'].items()} + + # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation + # In Apprise, it relies on prefixing each request arg with "-", because it uses say &method=update as a flag for apprise + # but here we are making straight requests, so we need todo convert this against apprise's logic + for k, v in results['qsd'].items(): + if not k.strip('+-') in results['qsd+'].keys(): + params[URLBase.unquote(k)] = URLBase.unquote(v) + + # Determine Authentication + auth = '' + if results.get('user') and results.get('password'): + auth = (URLBase.unquote(results.get('user')), URLBase.unquote(results.get('user'))) + elif results.get('user'): + auth = (URLBase.unquote(results.get('user'))) + + # Try to auto-guess if it's JSON + try: + json.loads(body) + headers['Content-Type'] = 'application/json; charset=utf-8' + except ValueError as e: + pass + + r(results.get('url'), + auth=auth, + data=body.encode('utf-8') if type(body) is str else body, + headers=headers, + params=params + ) \ No newline at end of file diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index c39b2636..149d7f96 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -1,8 +1,6 @@ from loguru import logger -import chardet import hashlib import os -import requests from changedetectionio import strtobool from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived from changedetectionio.content_fetchers.base import Fetcher @@ -28,6 +26,9 @@ class fetcher(Fetcher): is_binary=False, empty_pages_are_a_change=False): + import chardet + import requests + if self.browser_steps_get_valid_steps(): raise BrowserStepsInUnsupportedFetcher(url=url) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index fd12393a..6324b58b 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -537,7 +537,8 @@ def changedetection_app(config=None, datastore_o=None): import random from .apprise_asset import asset apobj = apprise.Apprise(asset=asset) - + # so that the custom endpoints are registered + from changedetectionio.apprise import apprise_custom_api_call_wrapper is_global_settings_form = request.args.get('mode', '') == 'global-settings' is_group_settings_form = request.args.get('mode', '') == 'group-settings' diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index b0b19f99..ce2841de 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -221,7 +221,8 @@ class ValidateAppRiseServers(object): def __call__(self, form, field): import apprise apobj = apprise.Apprise() - + # so that the custom endpoints are registered + from changedetectionio.apprise import apprise_custom_api_call_wrapper for server_url in field.data: if not apobj.add(server_url): message = field.gettext('\'%s\' is not a valid AppRise URL.' % (server_url)) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index bd5fdb8f..ffe00cd0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -1,10 +1,4 @@ - -from bs4 import BeautifulSoup -from inscriptis import get_text -from jsonpath_ng.ext import parse from typing import List -from inscriptis.model.config import ParserConfig -from xml.sax.saxutils import escape as xml_escape import json import re @@ -39,6 +33,7 @@ def perl_style_slash_enclosed_regex_to_options(regex): # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches def include_filters(include_filters, html_content, append_pretty_line_formatting=False): + from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, "html.parser") html_block = "" r = soup.select(include_filters, separator="") @@ -56,6 +51,7 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting return html_block def subtractive_css_selector(css_selector, html_content): + from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, "html.parser") for item in soup.select(css_selector): item.decompose() @@ -181,6 +177,7 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals # Extract/find element def extract_element(find='title', html_content=''): + from bs4 import BeautifulSoup #Re #106, be sure to handle when its not found element_text = None @@ -194,6 +191,8 @@ def extract_element(find='title', html_content=''): # def _parse_json(json_data, json_filter): + from jsonpath_ng.ext import parse + if json_filter.startswith("json:"): jsonpath_expression = parse(json_filter.replace('json:', '')) match = jsonpath_expression.find(json_data) @@ -242,6 +241,8 @@ def _get_stripped_text_from_json_match(match): # json_filter - ie json:$..price # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector) def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None): + from bs4 import BeautifulSoup + stripped_text_from_html = False # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags @@ -352,6 +353,7 @@ def strip_ignore_text(content, wordlist, mode="content"): return "\n".encode('utf8').join(output) def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: + from xml.sax.saxutils import escape as xml_escape pattern = ')\s*)*)\]\]>' def repl(m): text = m.group(1) @@ -360,6 +362,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False return re.sub(pattern, repl, html_content) def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str: + from inscriptis import get_text + from inscriptis.model.config import ParserConfig + """Converts html string to a string with just the text. If ignoring rendering anchor tag content is enable, anchor tag content are also included in the text diff --git a/changedetectionio/notification.py b/changedetectionio/notification.py index d685ab1d..54e682ae 100644 --- a/changedetectionio/notification.py +++ b/changedetectionio/notification.py @@ -1,9 +1,10 @@ -import apprise + import time from apprise import NotifyFormat -import json +import apprise from loguru import logger + valid_tokens = { 'base_url': '', 'current_snapshot': '', @@ -34,87 +35,11 @@ valid_notification_formats = { default_notification_format_for_watch: default_notification_format_for_watch } -# include the decorator -from apprise.decorators import notify - -@notify(on="delete") -@notify(on="deletes") -@notify(on="get") -@notify(on="gets") -@notify(on="post") -@notify(on="posts") -@notify(on="put") -@notify(on="puts") -def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs): - import requests - from apprise.utils import parse_url as apprise_parse_url - from apprise import URLBase - - url = kwargs['meta'].get('url') - - if url.startswith('post'): - r = requests.post - elif url.startswith('get'): - r = requests.get - elif url.startswith('put'): - r = requests.put - elif url.startswith('delete'): - r = requests.delete - - url = url.replace('post://', 'http://') - url = url.replace('posts://', 'https://') - url = url.replace('put://', 'http://') - url = url.replace('puts://', 'https://') - url = url.replace('get://', 'http://') - url = url.replace('gets://', 'https://') - url = url.replace('put://', 'http://') - url = url.replace('puts://', 'https://') - url = url.replace('delete://', 'http://') - url = url.replace('deletes://', 'https://') - - headers = {} - params = {} - auth = None - - # Convert /foobar?+some-header=hello to proper header dictionary - results = apprise_parse_url(url) - if results: - # Add our headers that the user can potentially over-ride if they wish - # to to our returned result set and tidy entries by unquoting them - headers = {URLBase.unquote(x): URLBase.unquote(y) - for x, y in results['qsd+'].items()} - - # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation - # In Apprise, it relies on prefixing each request arg with "-", because it uses say &method=update as a flag for apprise - # but here we are making straight requests, so we need todo convert this against apprise's logic - for k, v in results['qsd'].items(): - if not k.strip('+-') in results['qsd+'].keys(): - params[URLBase.unquote(k)] = URLBase.unquote(v) - - # Determine Authentication - auth = '' - if results.get('user') and results.get('password'): - auth = (URLBase.unquote(results.get('user')), URLBase.unquote(results.get('user'))) - elif results.get('user'): - auth = (URLBase.unquote(results.get('user'))) - - # Try to auto-guess if it's JSON - try: - json.loads(body) - headers['Content-Type'] = 'application/json; charset=utf-8' - except ValueError as e: - pass - - r(results.get('url'), - auth=auth, - data=body.encode('utf-8') if type(body) is str else body, - headers=headers, - params=params - ) def process_notification(n_object, datastore): - + # so that the custom endpoints are registered + from changedetectionio.apprise import apprise_custom_api_call_wrapper from .safe_jinja import render as jinja_render now = time.time() if n_object.get('notification_timestamp'): diff --git a/changedetectionio/store.py b/changedetectionio/store.py index c3772557..cc1b335f 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -11,7 +11,6 @@ from threading import Lock import json import os import re -import requests import secrets import threading import time @@ -270,6 +269,7 @@ class ChangeDetectionStore: self.needs_write_urgent = True def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=True): + import requests if extras is None: extras = {} From 19f3851c9ddf44ebf25bbe637271b3d7456aaf66 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 11 Sep 2024 16:20:49 +0200 Subject: [PATCH 3/3] Memory management improvements - LXML and other libraries can leak allocation, wrap in a sub-process (#2626) --- .../processors/restock_diff/processor.py | 14 +++++-- .../processors/text_json_diff/processor.py | 38 +++++++++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 1a3a96ca..3d8e4349 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -2,8 +2,7 @@ from .. import difference_detection_processor from ..exceptions import ProcessorException from . import Restock from loguru import logger -import hashlib -import re + import urllib3 import time @@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock: """ from jsonpath_ng import parse + import re now = time.time() import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") @@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor): xpath_data = None def run_changedetection(self, watch, skip_when_checksum_same=True): + import hashlib + + from concurrent.futures import ProcessPoolExecutor + from functools import partial if not watch: raise Exception("Watch no longer exists.") @@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor): itemprop_availability = {} try: - itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content) + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments + # anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(get_itemprop_availability, self.fetcher.content)) + itemprop_availability = future.result() except MoreThanOnePriceFound as e: # Add the real data raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.", diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 1de5bafb..77c37131 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError): class perform_site_check(difference_detection_processor): def run_changedetection(self, watch, skip_when_checksum_same=True): + from concurrent.futures import ProcessPoolExecutor + from functools import partial + changed_detected = False html_content = "" screenshot = False # as bytes @@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor): for filter_rule in include_filters_rule: # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): - html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() + elif filter_rule.startswith('xpath1:'): - html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content += html_tools.include_filters(include_filters=filter_rule, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule, html_content=self.fetcher.content, - append_pretty_line_formatting=not watch.is_source_type_url) + append_pretty_line_formatting=not watch.is_source_type_url)) + html_content += future.result() if not html_content.strip(): raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data) @@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor): else: # extract text do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False) - stripped_text_from_html = \ - html_tools.html_to_text( - html_content=html_content, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.html_to_text, html_content=html_content, render_anchor_tag_content=do_anchor, - is_rss=is_rss # #1874 activate the something</p> will add an extra line feed to signify the paragraph gap