Reverting subprocess execution - it saved a little memory but used a LOT more CPU

reverse-2626-memory
dgtlmoon 4 months ago
parent e173954cdd
commit 192ae8064c

@@ -143,8 +143,6 @@ class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
import hashlib import hashlib
from concurrent.futures import ProcessPoolExecutor
from functools import partial
if not watch: if not watch:
raise Exception("Watch no longer exists.") raise Exception("Watch no longer exists.")
@@ -186,11 +184,7 @@ class perform_site_check(difference_detection_processor):
itemprop_availability = {} itemprop_availability = {}
try: try:
with ProcessPoolExecutor() as executor: itemprop_availability = get_itemprop_availability(self.fetcher.content)
# Use functools.partial to create a callable with arguments
# anything using bs4/lxml etc is quite "leaky"
future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
itemprop_availability = future.result()
except MoreThanOnePriceFound as e: except MoreThanOnePriceFound as e:
# Add the real data # Add the real data
raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.", raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",

@@ -36,8 +36,6 @@ class PDFToHTMLToolNotFound(ValueError):
class perform_site_check(difference_detection_processor): class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
from concurrent.futures import ProcessPoolExecutor
from functools import partial
changed_detected = False changed_detected = False
html_content = "" html_content = ""
@@ -174,30 +172,20 @@ class perform_site_check(difference_detection_processor):
for filter_rule in include_filters_rule: for filter_rule in include_filters_rule:
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.." # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
with ProcessPoolExecutor() as executor: html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
html_content=self.fetcher.content, html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url, append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss)) is_rss=is_rss)
html_content += future.result()
elif filter_rule.startswith('xpath1:'): elif filter_rule.startswith('xpath1:'):
with ProcessPoolExecutor() as executor: html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" html_content=self.fetcher.content,
future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''), append_pretty_line_formatting=not watch.is_source_type_url,
html_content=self.fetcher.content, is_rss=is_rss)
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss))
html_content += future.result()
else: else:
with ProcessPoolExecutor() as executor: html_content += html_tools.include_filters(include_filters=filter_rule,
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
html_content=self.fetcher.content, html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url)) append_pretty_line_formatting=not watch.is_source_type_url)
html_content += future.result()
if not html_content.strip(): if not html_content.strip():
raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data) raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
@@ -210,13 +198,9 @@ class perform_site_check(difference_detection_processor):
else: else:
# extract text # extract text
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False) do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
with ProcessPoolExecutor() as executor: stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
# Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky" render_anchor_tag_content=do_anchor,
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text is_rss=is_rss) # 1874 activate the <title workaround hack
future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
render_anchor_tag_content=do_anchor,
is_rss=is_rss)) #1874 activate the <title workaround hack
stripped_text_from_html = future.result()
if watch.get('trim_text_whitespace'): if watch.get('trim_text_whitespace'):

Loading…
Cancel
Save