Import speed improvements, and adding an import URL batch size of 5,000 to stop accidental CPU overload (#549)

show-which-items-in-queue
dgtlmoon authored, committed by GitHub
parent 6c3c5dc28a
commit 2a9fb12451

@@ -661,13 +661,19 @@ def changedetection_app(config=None, datastore_o=None):
         good = 0
 
         if request.method == 'POST':
+            now=time.time()
             urls = request.values.get('urls').split("\n")
+            if (len(urls) > 5000):
+                flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
+
             for url in urls:
                 url = url.strip()
                 url, *tags = url.split(" ")
                 # Flask wtform validators wont work with basic auth, use validators package
-                if len(url) and validators.url(url.replace('source:', '')):
-                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags))
+                # Up to 5000 per batch so we dont flood the server
+                if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
+                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
+                    # Straight into the queue.
+                    update_q.put(new_uuid)
                     good += 1
@@ -675,7 +681,8 @@ def changedetection_app(config=None, datastore_o=None):
                 if len(url):
                     remaining_urls.append(url)
 
-            flash("{} Imported, {} Skipped.".format(good, len(remaining_urls)))
+            flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
+            datastore.needs_write = True
 
         if len(remaining_urls) == 0:
             # Looking good, redirect to index.
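
Taken together, the handler changes above work like this: each POST is capped at 5,000 URLs, every accepted watch is created without an immediate disk write, and its UUID goes straight onto the recheck queue; anything past the cap comes back as "remaining" so it can be imported again. A minimal standalone sketch of that pattern, where add_watch and validate are injected stand-ins for datastore.add_watch and validators.url from the diff:

import queue
import time

MAX_BATCH = 5000

def import_urls(raw_text, add_watch, validate, update_q):
    # add_watch/validate are stand-ins for datastore.add_watch and
    # validators.url in the real handler.
    now = time.time()
    good, remaining_urls = 0, []
    for line in raw_text.split("\n"):
        url, *tags = line.strip().split(" ")
        if len(url) and validate(url) and good < MAX_BATCH:
            # Defer the JSON write; one sync covers the whole batch.
            new_uuid = add_watch(url=url, tag=" ".join(tags), write_to_disk_now=False)
            update_q.put(new_uuid)  # straight into the recheck queue
            good += 1
        elif len(url):
            remaining_urls.append(line)
    print("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(remaining_urls)))
    return remaining_urls

# Example: only the first 5,000 valid URLs are accepted per call.
q = queue.Queue()
left_over = import_urls("https://example.com some-tag\nnot a url\n",
                        add_watch=lambda **kw: kw['url'],
                        validate=lambda u: u.startswith('http'),
                        update_q=q)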

@@ -13,7 +13,6 @@ from changedetectionio.notification import (
 
 class model(dict):
     def __init__(self, *arg, **kw):
-        super(model, self).__init__(*arg, **kw)
         self.update({
             'url': None,
             'tag': None,
@@ -45,6 +44,9 @@ class model(dict):
             # Should be all None by default, so we use the system default in this case.
             'minutes_between_check': None
         })
+        # goes at the end so we update the default object with the initialiser
+        super(model, self).__init__(*arg, **kw)
+
     @property
     def has_empty_checktime(self):
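
The Watch model change is subtle: previously super(model, self).__init__(*arg, **kw) ran before self.update({...}) filled in the defaults, so the defaults clobbered whatever the caller passed in. Moving the super call after the defaults lets the initialiser win, which is exactly what the store's add_watch now relies on. A minimal sketch of the pattern (Model and DEFAULTS are illustrative names, not the real class):

class Model(dict):
    DEFAULTS = {'url': None, 'tag': None, 'minutes_between_check': None}

    def __init__(self, *arg, **kw):
        self.update(self.DEFAULTS)      # defaults first...
        super().__init__(*arg, **kw)    # ...then the initialiser overrides them

w = Model({'url': 'https://example.com'})
assert w['url'] == 'https://example.com'  # caller's value survives
assert w['tag'] is None                   # unset keys keep their defaults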

@@ -272,15 +272,14 @@ class ChangeDetectionStore:
         self.needs_write = True
         return changes_removed
 
-    def add_watch(self, url, tag="", extras=None):
+    def add_watch(self, url, tag="", extras=None, write_to_disk_now=True):
         if extras is None:
             extras = {}
 
         with self.lock:
             # @todo use a common generic version of this
             new_uuid = str(uuid_builder.uuid4())
-            _blank = deepcopy(self.generic_definition)
-            _blank.update({
+            new_watch = Watch.model({
                 'url': url,
                 'tag': tag
             })
@@ -291,9 +290,8 @@ class ChangeDetectionStore:
                 if k in apply_extras:
                     del apply_extras[k]
 
-            _blank.update(apply_extras)
-            self.data['watching'][new_uuid] = _blank
+            new_watch.update(apply_extras)
+            self.__data['watching'][new_uuid]=new_watch
 
         # Get the directory ready
         output_path = "{}/{}".format(self.datastore_path, new_uuid)
@@ -302,7 +300,8 @@ class ChangeDetectionStore:
         except FileExistsError:
             print(output_path, "already exists.")
 
-        self.sync_to_json()
+        if write_to_disk_now:
+            self.sync_to_json()
 
         return new_uuid
 
     # Save some text file to the appropriate path and bump the history
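
The write_to_disk_now flag is what makes the bulk import cheap: sync_to_json() serialises the entire datastore, so calling it once per imported URL does quadratic work over the batch. The import view instead passes write_to_disk_now=False for each watch and flips datastore.needs_write once at the end. A toy illustration of the same deferred-write pattern (TinyStore is a hypothetical stand-in, not the real ChangeDetectionStore):

import json
import threading

class TinyStore:
    # Hypothetical stand-in demonstrating the deferred-write pattern.
    def __init__(self, path):
        self.path = path
        self.data = {'watching': {}}
        self.needs_write = False
        self.lock = threading.Lock()

    def sync_to_json(self):
        # Serialises the whole store; expensive to run once per watch.
        with open(self.path, 'w') as f:
            json.dump(self.data, f)

    def add_watch(self, url, write_to_disk_now=True):
        with self.lock:
            self.data['watching'][url] = {'url': url}
        if write_to_disk_now:
            self.sync_to_json()  # fine for a single interactive add
        return url

store = TinyStore('/tmp/watches.json')
for url in ('https://example.com/a', 'https://example.com/b'):
    store.add_watch(url, write_to_disk_now=False)  # defer during bulk import
store.needs_write = True   # let the background save loop persist the batch...
store.sync_to_json()       # ...or force a single write explicitly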

@@ -2,6 +2,7 @@ import threading
 import queue
 import time
+from changedetectionio import content_fetcher
 
 # A single update worker
 #
 # Requests for checking on a single site(watch) from a queue of watches
@ -32,7 +33,6 @@ class update_worker(threading.Thread):
else:
self.current_uuid = uuid
from changedetectionio import content_fetcher
if uuid in list(self.datastore.data['watching'].keys()):
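
Hoisting from changedetectionio import content_fetcher to module scope is a small companion fix: a repeated import statement does not reload the module (Python caches it in sys.modules), but it still executes a cache lookup on every pass through the worker loop, which is wasted work in a hot path. The difference is easy to measure with the standard library:

import timeit

# Re-running an import statement hits the sys.modules cache each time...
in_loop = timeit.timeit('import json', number=1_000_000)
# ...while a hoisted import pays its cost exactly once.
hoisted = timeit.timeit('pass', setup='import json', number=1_000_000)
print('import inside loop: {:.3f}s  hoisted: {:.3f}s'.format(in_loop, hoisted))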
