From c1a0481ec0625f64bdac66caa96ceffbb17324a4 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 23 May 2024 09:59:22 +0200 Subject: [PATCH] skip rdfa? --- changedetectionio/model/Watch.py | 1 + changedetectionio/processors/restock_diff.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index a88a220d..178187ee 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -197,6 +197,7 @@ class model(watch_base): return True return False + # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. @property def newest_history_key(self): diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py index f7e7401d..0005b5b2 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff.py @@ -38,11 +38,13 @@ def get_itemprop_availability(html_content) -> Restock: import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") - value = {} now = time.time() - # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7) - data = extruct.extract(html_content) + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. + + syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + + data = extruct.extract(html_content, syntaxes=syntaxes) logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful