Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602)

2 months ago · 55fe2abf42
parent 4225900ec3
commit 55fe2abf42
1 changed files with 6 additions and 3 deletions
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock:
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

-    value = {}
    now = time.time()
-    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.

+    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
-
+    try:
        data = extruct.extract(html_content, syntaxes=syntaxes)
+    except Exception as e:
+        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
+        return Restock()
+
    logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

    # First phase, dead simple scanning of anything that looks useful