From 4f536bb5595c233a0c618458b703651f43e0cc12 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Sat, 12 Feb 2022 21:40:35 +0100
Subject: [PATCH] Fix json detect bug

---
 changedetectionio/fetch_site_status.py | 46 +++++++++++++-------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 28dbe394..806325da 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -112,27 +112,28 @@ class perform_site_check():
 
             is_text_or_html = 'text' in update_obj['content-type']
             is_binary = content_fetcher.supported_binary_type(update_obj['content-type'])
-
             css_filter_rule = watch['css_filter']
-
             has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
 
-            # Make it reformat the JSON to something nice
+            # Auto-detect application/json, make it reformat the JSON to something nice
             if is_json and not has_filter_rule:
                 css_filter_rule = "json:$"
                 has_filter_rule = True
 
-            if has_filter_rule:
-                if 'json:' in css_filter_rule:
-                    stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
-                    original_content_before_filters = stripped_text_from_html.encode('utf-8')
-                    is_text_or_html = False
+            ##### CONVERT THE INPUT TO TEXT, EXTRACT THE PARTS THAT NEED TO BE FILTERED
+
+            # Dont depend on the content-type header here, maybe it's not present
+            if 'json:' in css_filter_rule:
+                rule = css_filter_rule.replace('json:', '')
+                stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content,
+                                                                            jsonpath_filter=rule).encode('utf-8')
+                is_text_or_html = False
+                original_content_before_filters = stripped_text_from_html
 
             if is_text_or_html:
                 # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                 html_content = fetcher.content
                 if not fetcher.headers.get('Content-Type', '') == 'text/plain':
-
                     if has_filter_rule:
                         # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                         if css_filter_rule[0] == '/':
@@ -140,9 +141,14 @@ class perform_site_check():
                         else:
                             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                             html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
                     # get_text() via inscriptis
                     stripped_text_from_html = get_text(html_content)
+
+                # Extract title as title
+                if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
+                    if not watch['title'] or not len(watch['title']):
+                        update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
+
                 else:
                     # Don't run get_text or xpath/css filters on plaintext
                     stripped_text_from_html = html_content
@@ -150,14 +156,13 @@ class perform_site_check():
                 # Re #340 - return the content before the 'ignore text' was applied
                 original_content_before_filters = stripped_text_from_html.encode('utf-8')
 
+
             # We rely on the actual text in the html output.. many sites have random script vars etc,
             # in the future we'll implement other mechanisms.
 
             update_obj["last_check_status"] = fetcher.get_last_status_code()
 
-            # If there's text to skip
-            # @todo we could abstract out the get_text() to handle this cleaner
-
+            ######## AFTER FILTERING, STRIP OUT IGNORE TEXT
             if is_text_or_html:
                 text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
                 if len(text_to_ignore):
@@ -165,16 +170,11 @@ class perform_site_check():
                 else:
                     stripped_text_from_html = stripped_text_from_html.encode('utf8')
 
-                # Re #133 - if we should strip whitespaces from triggering the change detected comparison
-                if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
-                    fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
-                else:
-                    fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
-
-                # Extract title as title
-                if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
-                    if not watch['title'] or not len(watch['title']):
-                        update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
+            # Re #133 - if we should strip whitespaces from triggering the change detected comparison
+            if is_text_or_html and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
+                fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
+            else:
+                fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
 
             # Goal here in the future is to be able to abstract out different content type checks into their own class