From efea9ac586db4aa95dfd4c79c4722e384a0551ff Mon Sep 17 00:00:00 2001 From: Nandakumar Chandrasekhar Date: Thu, 23 Nov 2023 10:31:48 +0530 Subject: [PATCH] Calculate word count using wordcount.lua filter --- README.md | 2 +- RELEASE.md | 3 + .../pandoc_reader/filters/wordcount.lua | 56 +++++++++++++++++++ .../plugins/pandoc_reader/pandoc_reader.py | 27 +++++++-- .../test/markdown/reading_time_content.md | 2 +- pyproject.toml | 1 - 6 files changed, 84 insertions(+), 7 deletions(-) create mode 100644 RELEASE.md create mode 100644 pelican/plugins/pandoc_reader/filters/wordcount.lua diff --git a/README.md b/README.md index 3f62710..141d628 100644 --- a/README.md +++ b/README.md @@ -232,7 +232,7 @@ The default value for reading speed is set to 200 words per minute, but may be c READING_SPEED = ``` -The number of words in a document is calculated using the [Markdown Word Count](https://github.com/gandreadis/markdown-word-count) package. +The number of words in a document is calculated using the [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount). 
### Customizing the Path for the `pandoc` Executable diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000..403c8c0 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,3 @@ +Release type: minor + +* Using [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount) instead of the [markdown-word-count](https://github.com/gandreadis/markdown-word-count) Python package to calculate word count \ No newline at end of file diff --git a/pelican/plugins/pandoc_reader/filters/wordcount.lua b/pelican/plugins/pandoc_reader/filters/wordcount.lua new file mode 100644 index 0000000..1406f88 --- /dev/null +++ b/pelican/plugins/pandoc_reader/filters/wordcount.lua @@ -0,0 +1,56 @@ +-- counts words in a document + +words = 0 +characters = 0 +characters_and_spaces = 0 +process_anyway = false + +wordcount = { + Str = function(el) + -- we don't count a word if it's entirely punctuation: + if el.text:match("%P") then + words = words + 1 + end + characters = characters + utf8.len(el.text) + characters_and_spaces = characters_and_spaces + utf8.len(el.text) + end, + + Space = function(el) + characters_and_spaces = characters_and_spaces + 1 + end, + + Code = function(el) + _,n = el.text:gsub("%S+","") + words = words + n + text_nospace = el.text:gsub("%s", "") + characters = characters + utf8.len(text_nospace) + characters_and_spaces = characters_and_spaces + utf8.len(el.text) + end, + + CodeBlock = function(el) + _,n = el.text:gsub("%S+","") + words = words + n + text_nospace = el.text:gsub("%s", "") + characters = characters + utf8.len(text_nospace) + characters_and_spaces = characters_and_spaces + utf8.len(el.text) + end +} + +-- check if the `wordcount` variable is set to `process-anyway` +function Meta(meta) + if meta.wordcount and (meta.wordcount=="process-anyway" + or meta.wordcount=="process" or meta.wordcount=="convert") then + process_anyway = true + end +end + +function Pandoc(el) + -- skip metadata, just count body: + 
pandoc.walk_block(pandoc.Div(el.blocks), wordcount) + print(words .. " words in body") + print(characters .. " characters in body") + print(characters_and_spaces .. " characters in body (including spaces)") + if not process_anyway then + os.exit(0) + end +end diff --git a/pelican/plugins/pandoc_reader/pandoc_reader.py b/pelican/plugins/pandoc_reader/pandoc_reader.py index b8b237d..99f2070 100644 --- a/pelican/plugins/pandoc_reader/pandoc_reader.py +++ b/pelican/plugins/pandoc_reader/pandoc_reader.py @@ -6,7 +6,6 @@ import subprocess import bs4 -from mwc.counter import count_words_in_markdown from ruamel.yaml import YAML, constructor from pelican import signals @@ -22,6 +21,7 @@ "%7Bfilename%7D": "{filename}", } FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"] +FILTERS_PATH = os.path.abspath(os.path.join(DIR_PATH, "filters")) PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html" PANDOC_SUPPORTED_MAJOR_VERSION = 2 PANDOC_SUPPORTED_MINOR_VERSION = 11 @@ -128,7 +128,8 @@ def _create_html(self, source_path, content, pandoc_executable): if self.settings.get("CALCULATE_READING_TIME", []): # Calculate reading time and add to metadata metadata["reading_time"] = self.process_metadata( - "reading_time", self._calculate_reading_time(content) + "reading_time", + self._calculate_reading_time(pandoc_executable, source_path), ) return output, metadata @@ -200,10 +201,28 @@ def _check_defaults(self, defaults_files): return citations, table_of_contents - def _calculate_reading_time(self, content): + def _calculate_reading_time(self, pandoc_executable, source_path): """Calculate time taken to read content.""" reading_speed = self.settings.get("READING_SPEED", DEFAULT_READING_SPEED) - wordcount = count_words_in_markdown(content) + + # Use the wordcount.lua filter to calculate the reading time + output = subprocess.run( + [ + pandoc_executable, + "--lua-filter", + os.path.join(FILTERS_PATH, "wordcount.lua"), + source_path, + ], + capture_output=True, + 
encoding="utf-8", + check=True, + ) + + # We have to extract the word count from stdout which looks like + # 102 words in body + # 536 characters in body + # 636 characters in body (including spaces) + wordcount = output.stdout.split()[0] time_unit = "minutes" try: diff --git a/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md b/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md index 4610dba..4b18994 100644 --- a/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md +++ b/pelican/plugins/pandoc_reader/test/markdown/reading_time_content.md @@ -6,4 +6,4 @@ date: "2020-10-16" ## What is Lorem Ipsum -Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. +Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Lorem Ipsum paragraph should be 100 words. 
diff --git a/pyproject.toml b/pyproject.toml index 73717aa..7f9665c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ classifiers = [ python = ">=3.8.1,<4.0" pelican = ">=4.5" markdown = {version = "<=3.3.4", optional = true} -markdown-word-count = "^0.0.1" pyyaml = "^6.0.0" beautifulsoup4 = "^4.9.3" "ruamel.yaml" = "^0.17.32"