From f215adbbe5e7818353549dab8a14104c87a3c85f Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Wed, 23 Jun 2021 20:40:01 +1000
Subject: [PATCH] CSS Filter - Smarter is to just extract the HTML blob and
 continue with inscriptis, so we have almost the same output as not using the
 filter

---
 backend/fetch_site_status.py       | 33 +++++++++++++++++-------------
 backend/tests/test_css_selector.py | 26 +++++++++++++++++++++++
 requirements.txt                   |  2 +-
 3 files changed, 46 insertions(+), 15 deletions(-)
diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index 75a87b32..e0296b45 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -5,6 +5,17 @@ from inscriptis import get_text
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
+# Given a CSS rule and a blob of HTML, return the HTML of the elements that match the rule
+class css_filter(object):
+    def apply(self, css_filter, html_content):
+        """Return the markup of every element matching `css_filter` in `html_content`, joined, plus a trailing newline."""
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html_content, "html.parser")
+        # select() has no `separator` option (that is a get_text() argument), so it is no longer passed.
+        # str(item) keeps each match as markup rather than text; callers feed the result to inscriptis.
+        matched_html = "".join(str(item) for item in soup.select(css_filter))
+        return matched_html + "\n"
+
 # Some common stuff here that can be moved to a base class
 class perform_site_check():
 
@@ -82,21 +93,15 @@ class perform_site_check():
                              timeout=timeout,
                              verify=False)
 
-            # CSS Filter
-            css_filter = self.datastore.data['watching'][uuid]['css_filter']
-            if css_filter and len(css_filter.strip()):
-                from bs4 import BeautifulSoup
-                soup = BeautifulSoup(r.content, "html.parser")
-                stripped_text_from_html = ""
-                for item in soup.select(css_filter):
-                    # By default, bs4's get_text will lump the text together
-                    # BS4's element strip() will lose the indentation format, I've tried using a space as separator, setting strip=False etc, but doesnt help
-                    # @todo ideas? if you compare the css_filtered output to non-filtered snapshot it will always lose the indentation/format
-                    text = str(item.get_text(separator="\n", strip=True)).strip() + '\n'
-                    stripped_text_from_html += text
+            html = r.text
 
-            else:
-                stripped_text_from_html = get_text(r.text)
+            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
+            if css_filter_rule and len(css_filter_rule.strip()):
+                filter = css_filter()
+                html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
+
+            stripped_text_from_html = get_text(html)
 
         # Usually from networkIO/requests level
         except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
diff --git a/backend/tests/test_css_selector.py b/backend/tests/test_css_selector.py
index a3a0ffc6..40d7e23a 100644
--- a/backend/tests/test_css_selector.py
+++ b/backend/tests/test_css_selector.py
@@ -43,6 +43,32 @@ def set_modified_response():
     return None
 
 
+# Test that the CSS extraction works as expected; what matters here is where the newlines (\n) end up in the text
+def test_css_filter_output():
+    from backend import fetch_site_status
+    from inscriptis import get_text
+
+    css_filter = fetch_site_status.css_filter()
+
+    # A match containing nested inline markup (<b>) should render as one line of text
+    content = """<html> <body><div id="thingthing" >  Some really <b>bold</b> text  </div> </body> </html>"""
+    html_blob = css_filter.apply(css_filter="#thingthing", html_content=content)
+    text = get_text(html_blob)
+    assert text == "  Some really bold text"
+
+    content = """<html> <body>
+    <p>foo bar blah</p>
+    <div class="parts">Block A</div> <div class="parts">Block B</div></body> 
+    </html>
+"""
+    html_blob = css_filter.apply(css_filter=".parts", html_content=content)
+    text = get_text(html_blob)
+
+    # Both matching divs are expected on separate lines, each indented by 4 spaces (inscriptis rendering)
+    assert text == "    Block A\n    Block B"
+
+
+# Tests the whole stack works with the CSS Filter
 def test_check_markup_css_filter_restriction(client, live_server):
     sleep_time_for_fetch_thread = 3
 
diff --git a/requirements.txt b/requirements.txt
index 0ba04251..e0e05ddc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,6 @@ wtforms ~= 2.3.3
 # Notification library
 apprise ~= 0.9
 
-# Used for CSS filtering
+# Used for CSS filtering, replace with soupsieve and lxml for xpath
 bs4