From 19f3851c9ddf44ebf25bbe637271b3d7456aaf66 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 11 Sep 2024 16:20:49 +0200 Subject: [PATCH] Memory management improvements - LXML and other libraries can leak allocation, wrap in a sub-process (#2626) --- .../processors/restock_diff/processor.py | 14 +++++-- .../processors/text_json_diff/processor.py | 38 +++++++++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 1a3a96ca..3d8e4349 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -2,8 +2,7 @@ from .. import difference_detection_processor from ..exceptions import ProcessorException from . import Restock from loguru import logger -import hashlib -import re + import urllib3 import time @@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock: """ from jsonpath_ng import parse + import re now = time.time() import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") @@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor): xpath_data = None def run_changedetection(self, watch, skip_when_checksum_same=True): + import hashlib + + from concurrent.futures import ProcessPoolExecutor + from functools import partial if not watch: raise Exception("Watch no longer exists.") @@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor): itemprop_availability = {} try: - itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content) + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments + # anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(get_itemprop_availability, self.fetcher.content)) + itemprop_availability = future.result() except MoreThanOnePriceFound as e: # Add the real data raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.", diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 1de5bafb..77c37131 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError): class perform_site_check(difference_detection_processor): def run_changedetection(self, watch, skip_when_checksum_same=True): + from concurrent.futures import ProcessPoolExecutor + from functools import partial + changed_detected = False html_content = "" screenshot = False # as bytes @@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor): for filter_rule in include_filters_rule: # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." 
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): - html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() + elif filter_rule.startswith('xpath1:'): - html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content += html_tools.include_filters(include_filters=filter_rule, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule, html_content=self.fetcher.content, - append_pretty_line_formatting=not watch.is_source_type_url) + append_pretty_line_formatting=not watch.is_source_type_url)) + html_content += future.result() if not html_content.strip(): raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data) @@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor): else: # extract text do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False) - stripped_text_from_html = \ - html_tools.html_to_text( - html_content=html_content, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.html_to_text, html_content=html_content, render_anchor_tag_content=do_anchor, - is_rss=is_rss # #1874 activate the something</p> will add an extra line feed to signify the paragraph gap
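
Every hunk in this patch applies the same isolation pattern: the lxml/bs4-backed call (the extruct metadata scrape, the xpath/xpath1/CSS filters, the inscriptis text extraction) is submitted to a short-lived worker via concurrent.futures.ProcessPoolExecutor, with functools.partial bundling the arguments, so whatever allocations the C-level parsers hold on to are returned to the operating system when the worker process exits. A minimal, self-contained sketch of that pattern follows, assuming lxml as the leaky library; parse_html and the sample document are illustrative placeholders, not part of changedetectionio.

    # Sketch only: isolate a memory-hungry parse in a child process so its
    # allocations die with the worker. parse_html and SAMPLE_HTML are
    # hypothetical stand-ins for the real html_tools helpers.
    from concurrent.futures import ProcessPoolExecutor
    from functools import partial

    from lxml import html  # any parser that may retain memory after use


    def parse_html(content: str, xpath: str) -> str:
        # Runs inside the worker process; memory lxml keeps around is
        # reclaimed when the worker shuts down with the pool.
        tree = html.fromstring(content)
        return "\n".join(tree.xpath(xpath))


    SAMPLE_HTML = "<html><body><p>hello</p><p>world</p></body></html>"

    if __name__ == "__main__":
        # Mirror the patch: open a pool around the single call, collect the
        # result, then let the pool (and its memory) be torn down.
        with ProcessPoolExecutor() as executor:
            future = executor.submit(partial(parse_html, SAMPLE_HTML, "//p/text()"))
            result = future.result()
        print(result)

Note the trade-off visible in the diff: a fresh pool is opened around each individual filter call rather than being reused across rules, which reclaims memory as promptly as possible but pays the worker start-up cost on every call.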