From 2a9fb12451eaafa5e8482c60f3b362d4944d5393 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 19 Apr 2022 23:15:32 +0200 Subject: [PATCH] Import speed improvements, and adding an import URL batch size of 5,000 to stop accidental CPU overload (#549) --- changedetectionio/__init__.py | 13 ++++++++++--- changedetectionio/model/Watch.py | 4 +++- changedetectionio/store.py | 13 ++++++------- changedetectionio/update_worker.py | 2 +- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 3e0f384f..4f58fdcb 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -661,13 +661,19 @@ def changedetection_app(config=None, datastore_o=None): good = 0 if request.method == 'POST': + now=time.time() urls = request.values.get('urls').split("\n") + + if (len(urls) > 5000): + flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.") + for url in urls: url = url.strip() url, *tags = url.split(" ") # Flask wtform validators wont work with basic auth, use validators package - if len(url) and validators.url(url.replace('source:', '')): - new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags)) + # Up to 5000 per batch so we dont flood the server + if len(url) and validators.url(url.replace('source:', '')) and good < 5000: + new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False) # Straight into the queue. update_q.put(new_uuid) good += 1 @@ -675,7 +681,8 @@ def changedetection_app(config=None, datastore_o=None): if len(url): remaining_urls.append(url) - flash("{} Imported, {} Skipped.".format(good, len(remaining_urls))) + flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls))) + datastore.needs_write = True if len(remaining_urls) == 0: # Looking good, redirect to index. diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index e97ee91c..b86b930e 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -13,7 +13,6 @@ from changedetectionio.notification import ( class model(dict): def __init__(self, *arg, **kw): - super(model, self).__init__(*arg, **kw) self.update({ 'url': None, 'tag': None, @@ -45,6 +44,9 @@ class model(dict): # Should be all None by default, so we use the system default in this case. 'minutes_between_check': None }) + # goes at the end so we update the default object with the initialiser + super(model, self).__init__(*arg, **kw) + @property def has_empty_checktime(self): diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 6f9d69e4..9aeebeeb 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -272,15 +272,14 @@ class ChangeDetectionStore: self.needs_write = True return changes_removed - def add_watch(self, url, tag="", extras=None): + def add_watch(self, url, tag="", extras=None, write_to_disk_now=True): if extras is None: extras = {} with self.lock: # @todo use a common generic version of this new_uuid = str(uuid_builder.uuid4()) - _blank = deepcopy(self.generic_definition) - _blank.update({ + new_watch = Watch.model({ 'url': url, 'tag': tag }) @@ -291,9 +290,8 @@ class ChangeDetectionStore: if k in apply_extras: del apply_extras[k] - _blank.update(apply_extras) - - self.data['watching'][new_uuid] = _blank + new_watch.update(apply_extras) + self.__data['watching'][new_uuid]=new_watch # Get the directory ready output_path = "{}/{}".format(self.datastore_path, new_uuid) @@ -302,7 +300,8 @@ class ChangeDetectionStore: except FileExistsError: print(output_path, "already exists.") - self.sync_to_json() + if write_to_disk_now: + self.sync_to_json() return new_uuid # Save some text file to the appropriate path and bump the history diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 51e65424..600cd232 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -2,6 +2,7 @@ import threading import queue import time +from changedetectionio import content_fetcher # A single update worker # # Requests for checking on a single site(watch) from a queue of watches @@ -32,7 +33,6 @@ class update_worker(threading.Thread): else: self.current_uuid = uuid - from changedetectionio import content_fetcher if uuid in list(self.datastore.data['watching'].keys()):