Memory management improvements - LXML and other libraries can leak allocations, so wrap them in a sub-process (#2626)

pull/2351/head
dgtlmoon 4 months ago committed by GitHub
parent 7f2fa20318
commit 19f3851c9d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -2,8 +2,7 @@ from .. import difference_detection_processor
from ..exceptions import ProcessorException from ..exceptions import ProcessorException
from . import Restock from . import Restock
from loguru import logger from loguru import logger
import hashlib
import re
import urllib3 import urllib3
import time import time
@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock:
""" """
from jsonpath_ng import parse from jsonpath_ng import parse
import re
now = time.time() now = time.time()
import extruct import extruct
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor):
xpath_data = None xpath_data = None
def run_changedetection(self, watch, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
import hashlib
from concurrent.futures import ProcessPoolExecutor
from functools import partial
if not watch: if not watch:
raise Exception("Watch no longer exists.") raise Exception("Watch no longer exists.")
@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor):
itemprop_availability = {} itemprop_availability = {}
try: try:
itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content) with ProcessPoolExecutor() as executor:
# Use functools.partial to create a callable with arguments
# anything using bs4/lxml etc is quite "leaky"
future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
itemprop_availability = future.result()
except MoreThanOnePriceFound as e: except MoreThanOnePriceFound as e:
# Add the real data # Add the real data
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.", raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",

@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError):
class perform_site_check(difference_detection_processor): class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
from concurrent.futures import ProcessPoolExecutor
from functools import partial
changed_detected = False changed_detected = False
html_content = "" html_content = ""
screenshot = False # as bytes screenshot = False # as bytes
@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor):
for filter_rule in include_filters_rule: for filter_rule in include_filters_rule:
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.." # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), with ProcessPoolExecutor() as executor:
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
html_content=self.fetcher.content, html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url, append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss) is_rss=is_rss))
html_content += future.result()
elif filter_rule.startswith('xpath1:'): elif filter_rule.startswith('xpath1:'):
html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''), with ProcessPoolExecutor() as executor:
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''),
html_content=self.fetcher.content, html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url, append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss) is_rss=is_rss))
html_content += future.result()
else: else:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text with ProcessPoolExecutor() as executor:
html_content += html_tools.include_filters(include_filters=filter_rule, # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
html_content=self.fetcher.content, html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url) append_pretty_line_formatting=not watch.is_source_type_url))
html_content += future.result()
if not html_content.strip(): if not html_content.strip():
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data) raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor):
else: else:
# extract text # extract text
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False) do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
stripped_text_from_html = \ with ProcessPoolExecutor() as executor:
html_tools.html_to_text( # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
html_content=html_content, # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
render_anchor_tag_content=do_anchor, render_anchor_tag_content=do_anchor,
is_rss=is_rss # #1874 activate the <title workaround hack is_rss=is_rss)) #1874 activate the <title workaround hack
) stripped_text_from_html = future.result()
if watch.get('sort_text_alphabetically') and stripped_text_from_html: if watch.get('sort_text_alphabetically') and stripped_text_from_html:
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap

Loading…
Cancel
Save