changedetection.io/changedetectionio/html_tools.py


from bs4 import BeautifulSoup
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
from jsonpath_ng.ext import parse
from typing import List
import json
import re

# Markup inserted between the results matching a filter (e.g. `.example`) so that
# Inscriptis converts each result to a new line
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"

# Filter rule ('json:' prefix + JSONPath) selecting every 'offers' key at any depth.
# 'price', 'lowPrice' and 'highPrice' are usually found under here,
# although any of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"

class JSONNotFound(ValueError):
    """Signals that no parsable JSON could be located in a document."""

    def __init__(self, msg):
        super().__init__(msg)
        
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    """Collect the markup of every element matching a CSS selector.

    :param include_filters: CSS selector handed to BeautifulSoup's ``select()``
    :param html_content: HTML to search in
    :param append_pretty_line_formatting: when true, put TEXT_FILTER_LIST_LINE_SUFFIX
        in front of every match after the first, unless the match is itself a
        br/hr/div/p tag
    :return: the serialised matches concatenated in match order ('' when nothing matched)
    """
    # Tags for which no extra separator is inserted
    newline_tags = ('br', 'hr', 'div', 'p')
    matches = BeautifulSoup(html_content, "html.parser").select(include_filters, separator="")

    collected = ""
    for node in matches:
        # Only separate once something has been collected, so the output never starts with the suffix
        if append_pretty_line_formatting and collected and node.name not in newline_tags:
            collected = collected + TEXT_FILTER_LIST_LINE_SUFFIX
        collected = collected + str(node)

    return collected

def subtractive_css_selector(css_selector, html_content):
    """Return html_content, re-serialised, without the elements matching css_selector."""
    document = BeautifulSoup(html_content, "html.parser")
    unwanted = document.select(css_selector)
    for node in unwanted:
        # decompose() removes the node from the parsed tree
        node.decompose()
    return str(document)


def element_removal(selectors: List[str], html_content):
    """Strip from html_content everything matched by any of the given CSS selectors.

    The selectors are comma-joined into a single rule and passed to
    subtractive_css_selector().
    """
    combined_rule = ",".join(selectors)
    return subtractive_css_selector(css_selector=combined_rule, html_content=html_content)


# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False):
    """Evaluate an XPath expression against HTML and return the matches as one string.

    :param xpath_filter: XPath expression (surrounding whitespace is stripped); the
        EXSLT regular-expressions namespace is available under the ``re`` prefix
    :param html_content: HTML to evaluate the expression against (str)
    :param append_pretty_line_formatting: when true, put TEXT_FILTER_LIST_LINE_SUFFIX
        in front of every match after the first, unless the match is a br/hr/div/p element
    :return: str; element matches are serialised as markup, everything else
        (text() results, string()/count()/boolean() values) is converted with str()
    """
    from lxml import etree, html

    tree = html.fromstring(bytes(html_content, encoding='utf-8'))

    r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
    #@note: //title/text() wont work where <title>CDATA..

    # Scalar expressions such as string(...), count(...) or boolean(...) give back a single
    # value rather than a list; treat it as one match instead of iterating over it
    # (iterating a string would walk it character by character, a float/bool cannot be iterated)
    if not isinstance(r, list):
        r = [r]

    html_block = ""
    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or element.tag not in ('br', 'hr', 'div', 'p')):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        if etree.iselement(element):
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
        else:
            # Text / attribute results and scalar values
            html_block += str(element)

    return html_block


# Extract/find element
def extract_element(find='title', html_content=''):
    """Return the stripped text of the first ``find`` tag in html_content.

    Re #106 - returns None when the tag is not found or has no ``.string``.
    """
    node = BeautifulSoup(html_content, 'html.parser').find(find)

    # Guard first: nothing found, or nothing usable inside it
    if not node or not node.string:
        return None

    return node.string.strip()

#
def _parse_json(json_data, json_filter):
    if 'json:' in json_filter:
        jsonpath_expression = parse(json_filter.replace('json:', ''))
        match = jsonpath_expression.find(json_data)
        return _get_stripped_text_from_json_match(match)

    if 'jq:' in json_filter:

        try:
            import jq
        except ModuleNotFoundError:
            # `jq` requires full compilation in windows and so isn't generally available
            raise Exception("jq not support not found")

        jq_expression = jq.compile(json_filter.replace('jq:', ''))
        match = jq_expression.input(json_data).all()

        return _get_stripped_text_from_json_match(match)

def _get_stripped_text_from_json_match(match):
    s = []
    # More than one result, we will return it as a JSON list.
    if len(match) > 1:
        for i in match:
            s.append(i.value if hasattr(i, 'value') else i)

    # Single value, use just the value, as it could be later used in a token in notifications.
    if len(match) == 1:
        s = match[0].value if hasattr(match[0], 'value') else match[0]

    # Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
    if not match:
        # Re 265 - Just return an empty string when filter not found
        return ''

    # Ticket #462 - allow the original encoding through, usually it's UTF-8 or similar
    stripped_text_from_html = json.dumps(s, indent=4, ensure_ascii=False)

    return stripped_text_from_html

# content - json
# json_filter - ie json:$..price
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    """Apply a JSON rule to a JSON document, or to JSON embedded in the <script> tags of HTML.

    ``content`` is first decoded as JSON. When that fails it is parsed as HTML and each
    <script> tag (only ``type="application/ld+json"`` ones when
    ``ensure_is_ldjson_info_type`` is set) is tried in document order.

    :param content: JSON text, or HTML containing <script> tags with JSON
    :param json_filter: rule, ie ``json:$..price``
    :param ensure_is_ldjson_info_type: optional str such as "product"; the search stops at
        the first script that is a JSON object whose '@type' equals this value
        (case-insensitive; '@type' may be a string or a list of strings) and for which
        the rule matched something
    :return: the filtered JSON text, or '' when the rule matched nothing
    :raises JSONNotFound: when content is not JSON and contains no candidate <script> tag
    """
    stripped_text_from_html = False

    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
    try:
        stripped_text_from_html = _parse_json(json.loads(content), json_filter)
    except json.JSONDecodeError:

        # Foreach <script json></script> blob.. just return the first that matches json_filter
        soup = BeautifulSoup(content, 'html.parser')

        if ensure_is_ldjson_info_type:
            bs_result = soup.findAll('script', {"type": "application/ld+json"})
        else:
            bs_result = soup.findAll('script')

        if not bs_result:
            raise JSONNotFound("No parsable JSON found in this document")

        for result in bs_result:
            # Skip empty tags, and things that dont even look like JSON
            if not result.string or '{' not in result.string:
                continue

            try:
                json_data = json.loads(result.string)
            except json.JSONDecodeError:
                # Just skip it
                continue

            stripped_text_from_html = _parse_json(json_data, json_filter)

            if ensure_is_ldjson_info_type:
                # Could sometimes be list, string or something else random
                if isinstance(json_data, dict) and stripped_text_from_html:
                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
                    ld_type = json_data.get('@type')

                    # '@type' is normally a string, but may be a list ("@type": ["Product", "SubType"]);
                    # anything else is ignored rather than crashing on .lower()
                    if isinstance(ld_type, str):
                        type_names = [ld_type]
                    elif isinstance(ld_type, list):
                        type_names = [t for t in ld_type if isinstance(t, str)]
                    else:
                        type_names = []

                    if any(t.lower() == ensure_is_ldjson_info_type.lower() for t in type_names):
                        break
                # NOTE(review): when no script passes the check above, the result of the last
                # parsed script is still what gets returned below - confirm this is intended
            elif stripped_text_from_html:
                break

    if not stripped_text_from_html:
        # Re 265 - Just return an empty string when filter not found
        return ''

    return stripped_text_from_html

# Mode     - "content" return the content without the matches (default)
#          - "line numbers" return a list of line numbers that match (int list)
#
# wordlist - list of regex's (str) or words (str)
def strip_ignore_text(content, wordlist, mode="content"):
    """Drop (or report) the lines of content that match any ignore rule.

    A rule starting with '/' is a regular expression (surrounding '/' and spaces are
    stripped); any other rule is a plain substring. Both are matched case-insensitively.
    Blank lines are always dropped from the output and are never reported as ignored.
    Empty rules are skipped, and rules that are not valid regular expressions never match.

    :param content: text to filter (str)
    :param wordlist: list of regex's (str) or words (str)
    :param mode: "content" (default) or "line numbers"
    :return: in "line numbers" mode, the 1-based numbers of the ignored lines (int list);
        otherwise the remaining lines, UTF-8 encoded and joined with newlines (bytes)
    """
    ignore = []
    ignore_regex = []

    for k in wordlist:
        if not k:
            # Nothing to match on (and k[0] below would raise IndexError)
            continue

        # Is it a regex?
        if k[0] == '/':
            try:
                # Compile once here rather than once per line
                ignore_regex.append(re.compile(k.strip(" /"), re.IGNORECASE))
            except Exception:
                # Best effort - a rule that does not compile is simply never applied
                continue
        else:
            ignore.append(k.lower())

    output = []
    ignored_line_numbers = []
    for i, line in enumerate(content.splitlines(), start=1):
        # Always ignore blank lines in this mode. (when this function gets called)
        if not line.strip():
            continue

        lowered = line.lower()
        if any(regex.search(line) for regex in ignore_regex) or any(skip_text in lowered for skip_text in ignore):
            ignored_line_numbers.append(i)
        else:
            output.append(line.encode('utf8'))

    # Used for finding out what to highlight
    if mode == "line numbers":
        return ignored_line_numbers

    return b"\n".join(output)


def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    """Convert an HTML string to plain text using Inscriptis.

    :param html_content: string with html content
    :param render_anchor_tag_content: boolean flag; when set, the 'href' of 'a' tags
    is rendered together with the link text, in the following manner:
    '[ text ](anchor tag content)'
    :return: extracted text from the HTML
    """
    # No config (None) unless hyperlinks were requested
    parser_config = None
    if render_anchor_tag_content:
        parser_config = ParserConfig(annotation_rules={"a": ["hyperlink"]}, display_links=True)

    return get_text(html_content, config=parser_config)


# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
    """Return True when the 'offers' rule yields something from product-typed LD+JSON in content."""
    try:
        offers = extract_json_as_string(
            content=content,
            json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR,
            ensure_is_ldjson_info_type="product",
        )
    except JSONNotFound:
        # Totally fine - there is simply no JSON in this document
        return False

    return bool(offers)


def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
    This could go into its own Pip package in the future, for faster updates

    :param content: HTML (str); falsy values (None, '') are returned unchanged
    :return: content with whitespace-only HTML comments removed
    """

    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    # https://github.com/weblyzard/inscriptis/issues/45
    if not content:
        return content

    # Raw string so that \s reaches the regex engine instead of being an invalid string escape
    content = re.sub(r'<!--\s+-->', '', content)

    return content
Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`from bs4 import BeautifulSoup`
Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes 3 years ago			`from inscriptis import get_text`
			`from inscriptis.model.config import ParserConfig`
Windows - JQ - Make library optional so it doesnt break Windows pip installs (#1009) 2 years ago			`from jsonpath_ng.ext import parse`
			`from typing import List`
			`import json`
			`import re`
Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis`
Fixing bad linebreak definition `</br>` in notifications and UI (#1465) 2 years ago			`TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Automatically offer to track LD+JSON product price data (#1204) 2 years ago			`# 'price' , 'lowPrice', 'highPrice' are usually under here`
			`# all of those may or may not appear on different websites`
			`LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"`

Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`class JSONNotFound(ValueError):`
			`def __init__(self, msg):`
			`ValueError.__init__(self, msg)`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`def include_filters(include_filters, html_content, append_pretty_line_formatting=False):`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`soup = BeautifulSoup(html_content, "html.parser")`
			`html_block = ""`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`r = soup.select(include_filters, separator="")`

			`for element in r:`
			`# When there's more than 1 match, then add the suffix to separate each line`
			`# And where the matched result doesn't include something that will cause Inscriptis to add a newline`
			`# (This way each 'match' reliably has a new-line in the diff)`
			`# Divs are converted to 4 whitespaces by inscriptis`
			`if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']):`
			`html_block += TEXT_FILTER_LIST_LINE_SUFFIX`

			`html_block += str(element)`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`return html_block`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago			`def subtractive_css_selector(css_selector, html_content):`
			`soup = BeautifulSoup(html_content, "html.parser")`
			`for item in soup.select(css_selector):`
			`item.decompose()`
			`return str(soup)`

Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes 3 years ago
Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago			`def element_removal(selectors: List[str], html_content):`
			`"""Joins individual filters into one css filter."""`
			`selector = ",".join(selectors)`
			`return subtractive_css_selector(selector, html_content)`
Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes 3 years ago
XPath support (#355) * XPath support and minor improvements to form validation 3 years ago
			`# Return str Utf-8 of matched rules`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False):`
Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago			`from lxml import etree, html`
XPath support (#355) * XPath support and minor improvements to form validation 3 years ago
Fix encoding errors with XPath filters from UTF-8 responses (#619) 3 years ago			`tree = html.fromstring(bytes(html_content, encoding='utf-8'))`
XPath support (#355) * XPath support and minor improvements to form validation 3 years ago			`html_block = ""`

Send notification when CSS/xPath filter is missing after more than 6 (configurable) attempts (#771) 2 years ago			`r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})`
Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778) 2 years ago			`#@note: //title/text() wont work where <title>CDATA..`

			`for element in r:`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`# When there's more than 1 match, then add the suffix to separate each line`
			`# And where the matched result doesn't include something that will cause Inscriptis to add a newline`
			`# (This way each 'match' reliably has a new-line in the diff)`
			`# Divs are converted to 4 whitespaces by inscriptis`
			`if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):`
			`html_block += TEXT_FILTER_LIST_LINE_SUFFIX`

Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778) 2 years ago			`if type(element) == etree._ElementStringResult:`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`html_block += str(element)`
Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778) 2 years ago			`elif type(element) == etree._ElementUnicodeResult:`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`html_block += str(element)`
Enhancement - support xPath text() function filter, for example "//title/text()" in RSS feeds (#778) 2 years ago			`else:`
Filters can now accept a list/multiple filters (#1064) #623 2 years ago			`html_block += etree.tostring(element, pretty_print=True).decode('utf-8')`
XPath support (#355) * XPath support and minor improvements to form validation 3 years ago
			`return html_block`


Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago			`# Extract/find element`
			`def extract_element(find='title', html_content=''):`
Re #106 - handling empty title with gettr cleanup (#107) 4 years ago
			`#Re #106, be sure to handle when its not found`
			`element_text = None`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
			`soup = BeautifulSoup(html_content, 'html.parser')`
Re #106 - handling empty title with gettr cleanup (#107) 4 years ago			`result = soup.find(find)`
			`if result and result.string:`
			`element_text = result.string.strip()`

			`return element_text`
Auto extract html title as title (#102) * Auto extract <title> as watch title, Minor refactor for html tooling 4 years ago
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`#`
Adds support for jq JSON path querying engine (#1001) 2 years ago			`def _parse_json(json_data, json_filter):`
			`if 'json:' in json_filter:`
			`jsonpath_expression = parse(json_filter.replace('json:', ''))`
			`match = jsonpath_expression.find(json_data)`
			`return _get_stripped_text_from_json_match(match)`
Windows - JQ - Make library optional so it doesnt break Windows pip installs (#1009) 2 years ago
Adds support for jq JSON path querying engine (#1001) 2 years ago			`if 'jq:' in json_filter:`
Windows - JQ - Make library optional so it doesnt break Windows pip installs (#1009) 2 years ago
			`try:`
			`import jq`
			`except ModuleNotFoundError:`
			# `jq` requires full compilation in windows and so isn't generally available
			`raise Exception("jq not support not found")`

Adds support for jq JSON path querying engine (#1001) 2 years ago			`jq_expression = jq.compile(json_filter.replace('jq:', ''))`
			`match = jq_expression.input(json_data).all()`
Windows - JQ - Make library optional so it doesnt break Windows pip installs (#1009) 2 years ago
Adds support for jq JSON path querying engine (#1001) 2 years ago			`return _get_stripped_text_from_json_match(match)`

			`def _get_stripped_text_from_json_match(match):`
			`s = []`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`# More than one result, we will return it as a JSON list.`
			`if len(match) > 1:`
			`for i in match:`
Adds support for jq JSON path querying engine (#1001) 2 years ago			`s.append(i.value if hasattr(i, 'value') else i)`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
			`# Single value, use just the value, as it could be later used in a token in notifications.`
			`if len(match) == 1:`
Adds support for jq JSON path querying engine (#1001) 2 years ago			`s = match[0].value if hasattr(match[0], 'value') else match[0]`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
Re #257 - Handle bool val of json path better (#263) * Re #257 - Handle bool val of json path better, with test 3 years ago			`# Re #257 - Better handling where it does not exist, in the case the original 's' value was False..`
			`if not match:`
Re #265 - extended jsonpath support (#266) * Re #265 - Use extended JSONpath support, Allow a JSONPath selector to not match anything (yet) Adding test Correctly capture invalid JSONPath query error 3 years ago			`# Re 265 - Just return an empty string when filter not found`
			`return ''`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
JSON diff/preview should use utf-8 encoding where possible (#465) 3 years ago			`# Ticket #462 - allow the original encoding through, usually it's UTF-8 or similar`
			`stripped_text_from_html = json.dumps(s, indent=4, ensure_ascii=False)`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
			`return stripped_text_from_html`

Automatically offer to track LD+JSON product price data (#1204) 2 years ago			`# content - json`
			`# json_filter - ie json:$..price`
			`# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)`
			`def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`stripped_text_from_html = False`

			`# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>`
			`try:`
Adds support for jq JSON path querying engine (#1001) 2 years ago			`stripped_text_from_html = _parse_json(json.loads(content), json_filter)`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`except json.JSONDecodeError:`

Adds support for jq JSON path querying engine (#1001) 2 years ago			`# Foreach <script json></script> blob.. just return the first that matches json_filter`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`s = []`
			`soup = BeautifulSoup(content, 'html.parser')`
Automatically offer to track LD+JSON product price data (#1204) 2 years ago
			`if ensure_is_ldjson_info_type:`
			`bs_result = soup.findAll('script', {"type": "application/ld+json"})`
			`else:`
			`bs_result = soup.findAll('script')`

Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago
			`if not bs_result:`
			`raise JSONNotFound("No parsable JSON found in this document")`

			`for result in bs_result:`
Re #154 - Handle missing JSON better 3 years ago			`# Skip empty tags, and things that dont even look like JSON`
			`if not result.string or not '{' in result.string:`
			`continue`

Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`try:`
			`json_data = json.loads(result.string)`
			`except json.JSONDecodeError:`
			`# Just skip it`
			`continue`
			`else:`
Adds support for jq JSON path querying engine (#1001) 2 years ago			`stripped_text_from_html = _parse_json(json_data, json_filter)`
Automatically offer to track LD+JSON product price data (#1204) 2 years ago			`if ensure_is_ldjson_info_type:`
			`# Could sometimes be list, string or something else random`
			`if isinstance(json_data, dict):`
			`# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search`
			`# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)`
			`if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:`
			`break`
			`elif stripped_text_from_html:`
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`break`

Re #154 - Handle missing JSON better 3 years ago			`if not stripped_text_from_html:`
Re #265 - extended jsonpath support (#266) * Re #265 - Use extended JSONpath support, Allow a JSONPath selector to not match anything (yet) Adding test Correctly capture invalid JSONPath query error 3 years ago			`# Re 265 - Just return an empty string when filter not found`
			`return ''`
Re #154 - Handle missing JSON better 3 years ago
Re #154 Ldjson extract parse (#158) * Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it. * Update README.md 3 years ago			`return stripped_text_from_html`
Ability to visualise trigger and filter rules against the current snapshot on the preview page 3 years ago
			`# Mode - "content" return the content without the matches (default)`
			`# - "line numbers" return a list of line numbers that match (int list)`
			`#`
			`# wordlist - list of regex's (str) or words (str)`
			`def strip_ignore_text(content, wordlist, mode="content"):`
			`ignore = []`
			`ignore_regex = []`

			`# @todo check this runs case insensitive`
			`for k in wordlist:`

			`# Is it a regex?`
			`if k[0] == '/':`
			`ignore_regex.append(k.strip(" /"))`
			`else:`
			`ignore.append(k)`

			`i = 0`
			`output = []`
			`ignored_line_numbers = []`
			`for line in content.splitlines():`
			`i += 1`
			`# Always ignore blank lines in this mode. (when this function gets called)`
			`if len(line.strip()):`
			`regex_matches = False`

			`# if any of these match, skip`
			`for regex in ignore_regex:`
			`try:`
			`if re.search(regex, line, re.IGNORECASE):`
			`regex_matches = True`
			`except Exception as e:`
			`continue`

Ensure string matching on the ignore filter is always case-INsensitive 3 years ago			`if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):`
Ability to visualise trigger and filter rules against the current snapshot on the preview page 3 years ago			`output.append(line.encode('utf8'))`
			`else:`
			`ignored_line_numbers.append(i)`



			`# Used for finding out what to highlight`
			`if mode == "line numbers":`
			`return ignored_line_numbers`

Add filter to remove elements by CSS rule from HTML before change detection is run (#445) 3 years ago			`return "\n".encode('utf8').join(output)`
Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes 3 years ago

			`def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:`
			`"""Converts html string to a string with just the text. If ignoring`
			`rendering anchor tag content is enable, anchor tag content are also`
			`included in the text`

			`:param html_content: string with html content`
			`:param render_anchor_tag_content: boolean flag indicating whether to extract`
			`hyperlinks (the anchor tag content) together with text. This refers to the`
			`'href' inside 'a' tags.`
			`Anchor tag content is rendered in the following manner:`
			`'[ text ](anchor tag content)'`
			`:return: extracted text from the HTML`
			`"""`
			`# if anchor tag content flag is set to True define a config for`
			`# extracting this content`
			`if render_anchor_tag_content:`

			`parser_config = ParserConfig(`
			`annotation_rules={"a": ["hyperlink"]}, display_links=True`
			`)`

			`# otherwise set config to None`
			`else:`
			`parser_config = None`

			`# get text and annotations via inscriptis`
			`text_content = get_text(html_content, config=parser_config)`

			`return text_content`

Automatically offer to track LD+JSON product price data (#1204) 2 years ago
			`# Does LD+JSON exist with a @type=='product' and a .price set anywhere?`
			`def has_ldjson_product_info(content):`
			`try:`
			`pricing_data = extract_json_as_string(content=content, json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR, ensure_is_ldjson_info_type="product")`
			`except JSONNotFound as e:`
			`# Totally fine`
			`return False`
			`x=bool(pricing_data)`
			`return x`


Handle simple obfuscations - HomeDepot.com style price obfuscation (#764) 2 years ago			`def workarounds_for_obfuscations(content):`
			`"""`
			`Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis`
			`This could go into its own Pip package in the future, for faster updates`
			`"""`

			`# HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>`
			`# https://github.com/weblyzard/inscriptis/issues/45`
			`if not content:`
			`return content`

			`content = re.sub('<!--\s+-->', '', content)`

			`return content`