Fixed blog readtime calculation to ignore non-content text

squidfunk · Jul 16, 2024 · 22054eb · 22054eb
1 parent 4f8081c
commit 22054eb
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 6 deletions.
diff --git a/material/plugins/blog/readtime/parser.py b/material/plugins/blog/readtime/parser.py
@@ -31,15 +31,59 @@ class ReadtimeParser(HTMLParser):
     def __init__(self):
         super().__init__(convert_charrefs = True)
 
+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Stack of currently open (non-void) tags
+        self.context = []
+
         # Keep track of text and images
         self.text   = []
         self.images = 0
 
-    # Collect images
+    # Called at the start of every HTML tag
     def handle_starttag(self, tag, attrs):
+        # Collect images
         if tag == "img":
             self.images += 1
 
-    # Collect text
+        # Ignore void elements, which never have an end tag
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
     def handle_data(self, data):
-        self.text.append(data)
+        # Collect text if not inside skip context
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()
+
+
+# Void elements, i.e., tags that never have an end tag
+void = set([
+    "area",                            # Image map areas
+    "base",                            # Document base
+    "br",                              # Line breaks
+    "col",                             # Table columns
+    "embed",                           # External content
+    "hr",                              # Horizontal rules
+    "img",                             # Images
+    "input",                           # Input fields
+    "link",                            # Links
+    "meta",                            # Metadata
+    "param",                           # External parameters
+    "source",                          # Image source sets
+    "track",                           # Text track
+    "wbr"                              # Line break opportunities
+])
diff --git a/src/plugins/blog/readtime/parser.py b/src/plugins/blog/readtime/parser.py
@@ -31,15 +31,59 @@ class ReadtimeParser(HTMLParser):
     def __init__(self):
         super().__init__(convert_charrefs = True)
 
+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Stack of currently open (non-void) tags
+        self.context = []
+
         # Keep track of text and images
         self.text   = []
         self.images = 0
 
-    # Collect images
+    # Called at the start of every HTML tag
     def handle_starttag(self, tag, attrs):
+        # Collect images
         if tag == "img":
             self.images += 1
 
-    # Collect text
+        # Ignore void elements, which never have an end tag
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
     def handle_data(self, data):
-        self.text.append(data)
+        # Collect text if not inside skip context
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()
+
+
+# Void elements, i.e., tags that never have an end tag
+void = set([
+    "area",                            # Image map areas
+    "base",                            # Document base
+    "br",                              # Line breaks
+    "col",                             # Table columns
+    "embed",                           # External content
+    "hr",                              # Horizontal rules
+    "img",                             # Images
+    "input",                           # Input fields
+    "link",                            # Links
+    "meta",                            # Metadata
+    "param",                           # External parameters
+    "source",                          # Image source sets
+    "track",                           # Text track
+    "wbr"                              # Line break opportunities
+])