From 19f3851c9ddf44ebf25bbe637271b3d7456aaf66 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 11 Sep 2024 16:20:49 +0200 Subject: [PATCH] Memory management improvements - LXML and other libraries can leak allocation, wrap in a sub-process (#2626) --- .../processors/restock_diff/processor.py | 14 +++++-- .../processors/text_json_diff/processor.py | 38 +++++++++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 1a3a96ca..3d8e4349 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -2,8 +2,7 @@ from .. import difference_detection_processor from ..exceptions import ProcessorException from . import Restock from loguru import logger -import hashlib -import re + import urllib3 import time @@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock: """ from jsonpath_ng import parse + import re now = time.time() import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") @@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor): xpath_data = None def run_changedetection(self, watch, skip_when_checksum_same=True): + import hashlib + + from concurrent.futures import ProcessPoolExecutor + from functools import partial if not watch: raise Exception("Watch no longer exists.") @@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor): itemprop_availability = {} try: - itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content) + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments + # anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(get_itemprop_availability, self.fetcher.content)) + itemprop_availability = future.result() except MoreThanOnePriceFound as e: # Add the real data raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.", diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 1de5bafb..77c37131 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError): class perform_site_check(difference_detection_processor): def run_changedetection(self, watch, skip_when_checksum_same=True): + from concurrent.futures import ProcessPoolExecutor + from functools import partial + changed_detected = False html_content = "" screenshot = False # as bytes @@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor): for filter_rule in include_filters_rule: # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." 
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): - html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() + elif filter_rule.startswith('xpath1:'): - html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''), + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''), html_content=self.fetcher.content, append_pretty_line_formatting=not watch.is_source_type_url, - is_rss=is_rss) + is_rss=is_rss)) + html_content += future.result() else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content += html_tools.include_filters(include_filters=filter_rule, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule, html_content=self.fetcher.content, - append_pretty_line_formatting=not watch.is_source_type_url) + append_pretty_line_formatting=not watch.is_source_type_url)) + html_content += future.result() if not html_content.strip(): raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data) @@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor): else: # extract text do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False) - stripped_text_from_html = \ - html_tools.html_to_text( - html_content=html_content, + with ProcessPoolExecutor() as executor: + # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + future = executor.submit(partial(html_tools.html_to_text, html_content=html_content, render_anchor_tag_content=do_anchor, - is_rss=is_rss # #1874 activate the something</p> will add an extra line feed to signify the paragraph gap
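
Every hunk in this patch applies the same isolation pattern: the lxml/bs4-backed call (the extruct metadata scrape, the xpath/xpath1/CSS filters, the inscriptis text extraction) is submitted to a short-lived worker via concurrent.futures.ProcessPoolExecutor, with functools.partial bundling the arguments, so whatever allocations the C-level parsers hold on to are returned to the operating system when the worker process exits. A minimal, self-contained sketch of that pattern follows, assuming lxml as the leaky library; parse_html and the sample document are illustrative placeholders, not part of changedetectionio.

    # Sketch only: isolate a memory-hungry parse in a child process so its
    # allocations die with the worker. parse_html and SAMPLE_HTML are
    # hypothetical stand-ins for the real html_tools helpers.
    from concurrent.futures import ProcessPoolExecutor
    from functools import partial

    from lxml import html  # any parser that may retain memory after use


    def parse_html(content: str, xpath: str) -> str:
        # Runs inside the worker process; memory lxml keeps around is
        # reclaimed when the worker shuts down with the pool.
        tree = html.fromstring(content)
        return "\n".join(tree.xpath(xpath))


    SAMPLE_HTML = "<html><body><p>hello</p><p>world</p></body></html>"

    if __name__ == "__main__":
        # Mirror the patch: open a pool around the single call, collect the
        # result, then let the pool (and its memory) be torn down.
        with ProcessPoolExecutor() as executor:
            future = executor.submit(partial(parse_html, SAMPLE_HTML, "//p/text()"))
            result = future.result()
        print(result)

Note the trade-off visible in the diff: a fresh pool is opened around each individual filter call rather than being reused across rules, which reclaims memory as promptly as possible but pays the worker start-up cost on every call.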