From f215adbbe5e7818353549dab8a14104c87a3c85f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 23 Jun 2021 20:40:01 +1000 Subject: [PATCH] CSS Filter - Smarter is to just extract the HTML blob and continue with inscriptis, so we have almost the same output as not using the filter --- backend/fetch_site_status.py | 33 +++++++++++++++++------------- backend/tests/test_css_selector.py | 26 +++++++++++++++++++++++ requirements.txt | 2 +- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index 75a87b32..e0296b45 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -5,6 +5,17 @@ from inscriptis import get_text import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches +class css_filter(object): + def apply(self, css_filter, html_content): + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_content, "html.parser") + html_block = "" + for item in soup.select(css_filter, separator=""): + html_block += str(item) + + return html_block+"\n" + # Some common stuff here that can be moved to a base class class perform_site_check(): @@ -82,21 +93,15 @@ class perform_site_check(): timeout=timeout, verify=False) - # CSS Filter - css_filter = self.datastore.data['watching'][uuid]['css_filter'] - if css_filter and len(css_filter.strip()): - from bs4 import BeautifulSoup - soup = BeautifulSoup(r.content, "html.parser") - stripped_text_from_html = "" - for item in soup.select(css_filter): - # By default, bs4's get_text will lump the text together - # BS4's element strip() will lose the indentation format, I've tried using a space as separator, setting strip=False etc, but doesnt help - # @todo ideas? 
if you compare the css_filtered output to non-filtered snapshot it will always lose the indentation/format - text = str(item.get_text(separator="\n", strip=True)).strip() + '\n' - stripped_text_from_html += text + html = r.text - else: - stripped_text_from_html = get_text(r.text) + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + css_filter_rule = self.datastore.data['watching'][uuid]['css_filter'] + if css_filter_rule and len(css_filter_rule.strip()): + filter = css_filter() + html = filter.apply(css_filter=css_filter_rule, html_content=r.content) + + stripped_text_from_html = get_text(html) # Usually from networkIO/requests level except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e: diff --git a/backend/tests/test_css_selector.py b/backend/tests/test_css_selector.py index a3a0ffc6..40d7e23a 100644 --- a/backend/tests/test_css_selector.py +++ b/backend/tests/test_css_selector.py @@ -43,6 +43,32 @@ def set_modified_response(): return None +# Test that the CSS extraction works how we expect, important here is the right placing of new lines \n's +def test_css_filter_output(): + from backend import fetch_site_status + from inscriptis import get_text + + css_filter = fetch_site_status.css_filter() + + # Check text with sub-parts renders correctly + content = """
Some really bold text
""" + html_blob = css_filter.apply(css_filter="#thingthing", html_content=content) + text = get_text(html_blob) + assert text == " Some really bold text" + + content = """ +

foo bar blah

+
Block A
Block B
+ +""" + html_blob = css_filter.apply(css_filter=".parts", html_content=content) + text = get_text(html_blob) + + # Divs are converted to 4 whitespaces by inscriptis + assert text == " Block A\n Block B" + + +# Tests the whole stack works with the CSS Filter def test_check_markup_css_filter_restriction(client, live_server): sleep_time_for_fetch_thread = 3 diff --git a/requirements.txt b/requirements.txt index 0ba04251..e0e05ddc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,6 @@ wtforms ~= 2.3.3 # Notification library apprise ~= 0.9 -# Used for CSS filtering +# Used for CSS filtering, replace with soupsieve and lxml for xpath bs4