Merge branch 'master' into update-apprise-1.9.0

3 months ago · f5b7043aae
parent 3a0c992f1a 19f3851c9d
commit f5b7043aae
9 changed files with 140 additions and 106 deletions
--- a/changedetectionio/apprise/init.py
+++ b/changedetectionio/apprise/init.py
@ -0,0 +1,79 @@
 # include the decorator
 from apprise.decorators import notify
@notify(on="delete")
@notify(on="deletes")
@notify(on="get")
@notify(on="gets")
@notify(on="post")
@notify(on="posts")
@notify(on="put")
@notify(on="puts")
def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):
    """Send a notification body as a plain HTTP request.

    Registered through the ``@notify`` decorators for the ``get``, ``post``,
    ``put`` and ``delete`` URL schemas and their ``...s`` variants.  The
    schema picks the HTTP verb; a trailing ``s`` selects ``https://`` instead
    of ``http://``.

    Query-string handling:
      * ``+name=value`` entries become request headers.
      * Remaining entries are forwarded as request parameters, except those
        whose name (with any ``+``/``-`` prefix removed) matches a header.

    Args:
        body: Payload to send.  A ``str`` is encoded as UTF-8; any other
            value is handed to ``requests`` unchanged.
        title: Unused; part of the callback signature.
        notify_type: Unused; part of the callback signature.
        **kwargs: Must contain a ``meta`` mapping holding the target ``url``.

    Raises:
        ValueError: If the URL schema is not one of the supported verbs or
            the URL cannot be parsed.
    """
    import json
    import requests
    from apprise.utils import parse_url as apprise_parse_url
    from apprise import URLBase

    http_verbs = {
        'delete': requests.delete,
        'get': requests.get,
        'post': requests.post,
        'put': requests.put,
    }

    url = kwargs['meta'].get('url') or ''

    # Only the leading schema is inspected and rewritten, so text such as
    # "get://" appearing later in the URL (e.g. in a query value) is untouched.
    schema, separator, remainder = url.partition('://')
    schema = schema.lower()
    secure = schema.endswith('s') and schema[:-1] in http_verbs
    verb = schema[:-1] if secure else schema
    if not separator or verb not in http_verbs:
        raise ValueError(f"Unsupported notification URL schema in '{url}'")

    send = http_verbs[verb]
    url = f"{'https' if secure else 'http'}://{remainder}"

    # Convert /foobar?+some-header=hello to proper header dictionary
    results = apprise_parse_url(url)
    if not results:
        raise ValueError(f"Could not parse notification URL '{url}'")

    # Add our headers that the user can potentially over-ride if they wish
    # to our returned result set and tidy entries by unquoting them
    headers = {URLBase.unquote(x): URLBase.unquote(y)
               for x, y in results['qsd+'].items()}

    # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
    # In Apprise, it relies on prefixing each request arg with "-", because it uses say &method=update as a flag for apprise
    # but here we are making straight requests, so we need to convert this against apprise's logic
    params = {}
    for k, v in results['qsd'].items():
        if k.strip('+-') not in results['qsd+']:
            params[URLBase.unquote(k)] = URLBase.unquote(v)

    # Determine Authentication; a falsy value means no explicit credentials.
    auth = ''
    user = results.get('user')
    password = results.get('password')
    if user and password:
        auth = (URLBase.unquote(user), URLBase.unquote(password))
    elif user:
        # requests expects a (user, password) pair for basic auth
        auth = (URLBase.unquote(user), '')

    # Try to auto-guess if it's JSON
    try:
        json.loads(body)
        headers['Content-Type'] = 'application/json; charset=utf-8'
    except (TypeError, ValueError):
        # Not JSON (or not a JSON-loadable type); send without the header
        pass

    send(results.get('url'),
         auth=auth,
         data=body.encode('utf-8') if isinstance(body, str) else body,
         headers=headers,
         params=params
         )
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@ -1,8 +1,6 @@
 from loguru import logger
 import chardet
 import hashlib
 import os
 import requests
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 from changedetectionio.content_fetchers.base import Fetcher
@ -28,6 +26,9 @@ class fetcher(Fetcher):
            is_binary=False,
            empty_pages_are_a_change=False):
        import chardet
        import requests
        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@ -537,7 +537,8 @@ def changedetection_app(config=None, datastore_o=None):
        import random
        from .apprise_asset import asset
        apobj = apprise.Apprise(asset=asset)
-
+        # so that the custom endpoints are registered
        from changedetectionio.apprise import apprise_custom_api_call_wrapper
        is_global_settings_form = request.args.get('mode', '') == 'global-settings'
        is_group_settings_form = request.args.get('mode', '') == 'group-settings'
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -221,7 +221,8 @@ class ValidateAppRiseServers(object):
    def __call__(self, form, field):
        import apprise
        apobj = apprise.Apprise()
-
+        # so that the custom endpoints are registered
        from changedetectionio.apprise import apprise_custom_api_call_wrapper
        for server_url in field.data:
            if not apobj.add(server_url):
                message = field.gettext('\'%s\' is not a valid AppRise URL.' % (server_url))
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -1,10 +1,4 @@
 from bs4 import BeautifulSoup
 from inscriptis import get_text
 from jsonpath_ng.ext import parse
 from typing import List
 from inscriptis.model.config import ParserConfig
 from xml.sax.saxutils import escape as xml_escape
 import json
 import re
@ -39,6 +33,7 @@ def perl_style_slash_enclosed_regex_to_options(regex):
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    html_block = ""
    r = soup.select(include_filters, separator="")
@ -56,6 +51,7 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting
    return html_block
 def subtractive_css_selector(css_selector, html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    for item in soup.select(css_selector):
        item.decompose()
@ -181,6 +177,7 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
 # Extract/find element
 def extract_element(find='title', html_content=''):
    from bs4 import BeautifulSoup
    #Re #106, be sure to handle when its not found
    element_text = None
@ -194,6 +191,8 @@ def extract_element(find='title', html_content=''):
 #
 def _parse_json(json_data, json_filter):
    from jsonpath_ng.ext import parse
    if json_filter.startswith("json:"):
        jsonpath_expression = parse(json_filter.replace('json:', ''))
        match = jsonpath_expression.find(json_data)
@ -242,6 +241,8 @@ def _get_stripped_text_from_json_match(match):
 # json_filter - ie json:$..price
 # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I don't know how to do that as a JSON selector)
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    from bs4 import BeautifulSoup
    stripped_text_from_html = False
 # https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
@ -352,6 +353,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
    return "\n".encode('utf8').join(output)
 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
    pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
    def repl(m):
        text = m.group(1)
@ -360,6 +362,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
    return re.sub(pattern, repl, html_content)
 def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
    """Converts html string to a string with just the text. If ignoring
    rendering anchor tag content is enable, anchor tag content are also
    included in the text
--- a/changedetectionio/notification.py
+++ b/changedetectionio/notification.py
@ -1,9 +1,10 @@
-import apprise
+
 import time
 from apprise import NotifyFormat
-import json
+import apprise
 from loguru import logger
 valid_tokens = {
    'base_url': '',
    'current_snapshot': '',
@ -34,87 +35,11 @@ valid_notification_formats = {
    default_notification_format_for_watch: default_notification_format_for_watch
 }
 # include the decorator
 from apprise.decorators import notify
@notify(on="delete")
@notify(on="deletes")
@notify(on="get")
@notify(on="gets")
@notify(on="post")
@notify(on="posts")
@notify(on="put")
@notify(on="puts")
def apprise_custom_api_call_wrapper(body, title, notify_type, *args, **kwargs):
    """Send a notification body as a plain HTTP request.

    Registered through the ``@notify`` decorators for the ``get``, ``post``,
    ``put`` and ``delete`` URL schemas and their ``...s`` variants.  The
    schema picks the HTTP verb; a trailing ``s`` selects ``https://`` instead
    of ``http://``.

    Query-string handling:
      * ``+name=value`` entries become request headers.
      * Remaining entries are forwarded as request parameters, except those
        whose name (with any ``+``/``-`` prefix removed) matches a header.

    Args:
        body: Payload to send.  A ``str`` is encoded as UTF-8; any other
            value is handed to ``requests`` unchanged.
        title: Unused; part of the callback signature.
        notify_type: Unused; part of the callback signature.
        **kwargs: Must contain a ``meta`` mapping holding the target ``url``.

    Raises:
        ValueError: If the URL schema is not one of the supported verbs or
            the URL cannot be parsed.
    """
    # Imported locally so the function does not rely on a module-level import
    import json
    import requests
    from apprise.utils import parse_url as apprise_parse_url
    from apprise import URLBase

    http_verbs = {
        'delete': requests.delete,
        'get': requests.get,
        'post': requests.post,
        'put': requests.put,
    }

    url = kwargs['meta'].get('url') or ''

    # Only the leading schema is inspected and rewritten, so text such as
    # "get://" appearing later in the URL (e.g. in a query value) is untouched.
    schema, separator, remainder = url.partition('://')
    schema = schema.lower()
    secure = schema.endswith('s') and schema[:-1] in http_verbs
    verb = schema[:-1] if secure else schema
    if not separator or verb not in http_verbs:
        raise ValueError(f"Unsupported notification URL schema in '{url}'")

    send = http_verbs[verb]
    url = f"{'https' if secure else 'http'}://{remainder}"

    # Convert /foobar?+some-header=hello to proper header dictionary
    results = apprise_parse_url(url)
    if not results:
        raise ValueError(f"Could not parse notification URL '{url}'")

    # Add our headers that the user can potentially over-ride if they wish
    # to our returned result set and tidy entries by unquoting them
    headers = {URLBase.unquote(x): URLBase.unquote(y)
               for x, y in results['qsd+'].items()}

    # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
    # In Apprise, it relies on prefixing each request arg with "-", because it uses say &method=update as a flag for apprise
    # but here we are making straight requests, so we need to convert this against apprise's logic
    params = {}
    for k, v in results['qsd'].items():
        if k.strip('+-') not in results['qsd+']:
            params[URLBase.unquote(k)] = URLBase.unquote(v)

    # Determine Authentication; a falsy value means no explicit credentials.
    auth = ''
    user = results.get('user')
    password = results.get('password')
    if user and password:
        auth = (URLBase.unquote(user), URLBase.unquote(password))
    elif user:
        # requests expects a (user, password) pair for basic auth
        auth = (URLBase.unquote(user), '')

    # Try to auto-guess if it's JSON
    try:
        json.loads(body)
        headers['Content-Type'] = 'application/json; charset=utf-8'
    except (TypeError, ValueError):
        # Not JSON (or not a JSON-loadable type); send without the header
        pass

    send(results.get('url'),
         auth=auth,
         data=body.encode('utf-8') if isinstance(body, str) else body,
         headers=headers,
         params=params
         )
 def process_notification(n_object, datastore):
-
+    # so that the custom endpoints are registered
    from changedetectionio.apprise import apprise_custom_api_call_wrapper
    from .safe_jinja import render as jinja_render
    now = time.time()
    if n_object.get('notification_timestamp'):
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@ -2,8 +2,7 @@ from .. import difference_detection_processor
 from ..exceptions import ProcessorException
 from . import Restock
 from loguru import logger
-import hashlib
+
 import re
 import urllib3
 import time
@ -36,6 +35,7 @@ def get_itemprop_availability(html_content) -> Restock:
    """
    from jsonpath_ng import parse
    import re
    now = time.time()
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
@ -122,6 +122,10 @@ class perform_site_check(difference_detection_processor):
    xpath_data = None
    def run_changedetection(self, watch, skip_when_checksum_same=True):
        import hashlib
        from concurrent.futures import ProcessPoolExecutor
        from functools import partial
        if not watch:
            raise Exception("Watch no longer exists.")
@ -149,7 +153,11 @@ class perform_site_check(difference_detection_processor):
        itemprop_availability = {}
        try:
-            itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content)
+            with ProcessPoolExecutor() as executor:
                # Use functools.partial to create a callable with arguments
                # anything using bs4/lxml etc is quite "leaky"
                future = executor.submit(partial(get_itemprop_availability, self.fetcher.content))
                itemprop_availability = future.result()
        except MoreThanOnePriceFound as e:
            # Add the real data
            raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the content-change detection mode.",
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@ -36,6 +36,9 @@ class PDFToHTMLToolNotFound(ValueError):
 class perform_site_check(difference_detection_processor):
    def run_changedetection(self, watch, skip_when_checksum_same=True):
        from concurrent.futures import ProcessPoolExecutor
        from functools import partial
        changed_detected = False
        html_content = ""
        screenshot = False  # as bytes
@ -171,20 +174,30 @@ class perform_site_check(difference_detection_processor):
                    for filter_rule in include_filters_rule:
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
-                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
+                            with ProcessPoolExecutor() as executor:
                                # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
                                future = executor.submit(partial(html_tools.xpath_filter, xpath_filter=filter_rule.replace('xpath:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=is_rss)
+                                                                    is_rss=is_rss))
                                html_content += future.result()
                        elif filter_rule.startswith('xpath1:'):
-                            html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
+                            with ProcessPoolExecutor() as executor:
                                # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
                                future = executor.submit(partial(html_tools.xpath1_filter, xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=is_rss)
+                                                                    is_rss=is_rss))
                                html_content += future.result()
                        else:
-                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                            with ProcessPoolExecutor() as executor:
-                            html_content += html_tools.include_filters(include_filters=filter_rule,
+                                # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
                                # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                                future = executor.submit(partial(html_tools.include_filters, include_filters=filter_rule,
                                                                       html_content=self.fetcher.content,
-                                                                       append_pretty_line_formatting=not watch.is_source_type_url)
+                                                                       append_pretty_line_formatting=not watch.is_source_type_url))
                                html_content += future.result()
                    if not html_content.strip():
                        raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot, xpath_data=self.fetcher.xpath_data)
@ -197,12 +210,13 @@ class perform_site_check(difference_detection_processor):
                else:
                    # extract text
                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
-                    stripped_text_from_html = \
+                    with ProcessPoolExecutor() as executor:
-                        html_tools.html_to_text(
+                        # Use functools.partial to create a callable with arguments - anything using bs4/lxml etc is quite "leaky"
-                            html_content=html_content,
+                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                        future = executor.submit(partial(html_tools.html_to_text, html_content=html_content,
                            render_anchor_tag_content=do_anchor,
-                            is_rss=is_rss # #1874 activate the <title workaround hack
+                            is_rss=is_rss)) #1874 activate the <title workaround hack
-                        )
+                        stripped_text_from_html = future.result()
        if watch.get('sort_text_alphabetically') and stripped_text_from_html:
            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -11,7 +11,6 @@ from threading import Lock
 import json
 import os
 import re
 import requests
 import secrets
 import threading
 import time
@ -270,6 +269,7 @@ class ChangeDetectionStore:
        self.needs_write_urgent = True
    def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=True):
        import requests
        if extras is None:
            extras = {}