Skip to content

Commit

Permalink
Calculate word count using wordcount.lua filter
Browse files Browse the repository at this point in the history
  • Loading branch information
nandac committed Nov 23, 2023
1 parent 89b78d5 commit efea9ac
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ The default value for reading speed is set to 200 words per minute, but may be c
READING_SPEED = <words-per-minute>
```

The number of words in a document is calculated using the [Markdown Word Count](https://github.com/gandreadis/markdown-word-count) package.
The number of words in a document is calculated using the [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount).

### Customizing the Path for the `pandoc` Executable

Expand Down
3 changes: 3 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Release type: minor

* Using [wordcount Lua Filter](https://github.com/pandoc/lua-filters/tree/master/wordcount) instead of the [markdown-word-count](https://github.com/gandreadis/markdown-word-count) Python package to calculate word count
56 changes: 56 additions & 0 deletions pelican/plugins/pandoc_reader/filters/wordcount.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- counts words in a document

-- Running totals, updated by the element handlers while the document is walked.
words, characters, characters_and_spaces = 0, 0, 0

-- While this stays false, the filter ends the pandoc process once the counts
-- have been printed.
process_anyway = false

-- Adds the contents of a code element (inline `Code` or `CodeBlock`) to the
-- running totals. Code text is raw, so it is split on whitespace here.
-- All temporaries are local so that nothing leaks into the global environment.
local function count_code(el)
  local text = el.text
  -- gsub's second return value is the number of matches, i.e. the number of
  -- whitespace-separated tokens.
  local _, tokens = text:gsub("%S+", "")
  -- Binding to a single local keeps only the substituted string and drops
  -- gsub's match count.
  local stripped = text:gsub("%s", "")
  words = words + tokens
  characters = characters + utf8.len(stripped)
  characters_and_spaces = characters_and_spaces + utf8.len(text)
end

-- Element handlers used to walk the document body; each one updates the
-- global totals `words`, `characters` and `characters_and_spaces`.
wordcount = {
  Str = function(el)
    -- we don't count a word if it's entirely punctuation:
    if el.text:match("%P") then
      words = words + 1
    end
    -- utf8.len counts code points rather than bytes.
    local length = utf8.len(el.text)
    characters = characters + length
    characters_and_spaces = characters_and_spaces + length
  end,

  Space = function(el)
    -- a Space element contributes one character to the spaces-included total
    characters_and_spaces = characters_and_spaces + 1
  end,

  Code = count_code,

  CodeBlock = count_code,
}

-- check if the `wordcount` variable is set to `process-anyway`
-- Reads the `wordcount` metadata field and sets the global `process_anyway`
-- flag when its value is "process-anyway", "process" or "convert".
-- Any other value, or a missing field, leaves the flag unchanged.
function Meta(meta)
  local setting = meta.wordcount
  if not setting then
    return
  end

  -- A value given on the command line (-M wordcount=...) is a plain Lua
  -- string, but a value written in the document's YAML header is delivered as
  -- a parsed metadata value that never compares equal to a string. Flatten
  -- such values to their text before comparing.
  local kind = type(setting)
  if kind == "table" or kind == "userdata" then
    setting = pandoc.utils.stringify(setting)
  end

  if setting == "process-anyway" or setting == "process"
      or setting == "convert" then
    process_anyway = true
  end
end

-- Walks the document body with the `wordcount` handlers, prints the three
-- totals to stdout, and exits the process with status 0 unless
-- `process_anyway` is set.
function Pandoc(el)
  -- Only el.blocks is walked, so metadata is left out of the count. The
  -- blocks are wrapped in a single Div so that one walk covers all of them.
  local body = pandoc.Div(el.blocks)
  pandoc.walk_block(body, wordcount)

  print(words .. " words in body")
  print(characters .. " characters in body")
  print(characters_and_spaces .. " characters in body (including spaces)")

  if process_anyway then
    return
  end
  os.exit(0)
end
27 changes: 23 additions & 4 deletions pelican/plugins/pandoc_reader/pandoc_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import subprocess

import bs4
from mwc.counter import count_words_in_markdown
from ruamel.yaml import YAML, constructor

from pelican import signals
Expand All @@ -22,6 +21,7 @@
"%7Bfilename%7D": "{filename}",
}
FILE_EXTENSIONS = ["md", "mkd", "mkdn", "mdwn", "mdown", "markdown", "Rmd"]
FILTERS_PATH = os.path.abspath(os.path.join(DIR_PATH, "filters"))
PANDOC_READER_HTML_TEMPLATE = "pandoc-reader-default.html"
PANDOC_SUPPORTED_MAJOR_VERSION = 2
PANDOC_SUPPORTED_MINOR_VERSION = 11
Expand Down Expand Up @@ -128,7 +128,8 @@ def _create_html(self, source_path, content, pandoc_executable):
if self.settings.get("CALCULATE_READING_TIME", []):
# Calculate reading time and add to metadata
metadata["reading_time"] = self.process_metadata(
"reading_time", self._calculate_reading_time(content)
"reading_time",
self._calculate_reading_time(pandoc_executable, source_path),
)

return output, metadata
Expand Down Expand Up @@ -200,10 +201,28 @@ def _check_defaults(self, defaults_files):

return citations, table_of_contents

def _calculate_reading_time(self, content):
def _calculate_reading_time(self, pandoc_executable, source_path):
"""Calculate time taken to read content."""
reading_speed = self.settings.get("READING_SPEED", DEFAULT_READING_SPEED)
wordcount = count_words_in_markdown(content)

# Use the wordcount.lua filter to calculate the reading time
output = subprocess.run(
[
pandoc_executable,
"--lua-filter",
os.path.join(FILTERS_PATH, "wordcount.lua"),
source_path,
],
capture_output=True,
encoding="utf-8",
check=True,
)

# We have to extract the word count from stdout which looks like
# 102 words in body
# 536 characters in body
# 636 characters in body (including spaces)
wordcount = output.stdout.split()[0]

time_unit = "minutes"
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ date: "2020-10-16"

## What is Lorem Ipsum

Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Lorep Ipsum paragragh should be 100 words.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ classifiers = [
python = ">=3.8.1,<4.0"
pelican = ">=4.5"
markdown = {version = "<=3.3.4", optional = true}
markdown-word-count = "^0.0.1"
pyyaml = "^6.0.0"
beautifulsoup4 = "^4.9.3"
"ruamel.yaml" = "^0.17.32"
Expand Down

0 comments on commit efea9ac

Please sign in to comment.