Skip to content

Commit

Permalink
Fixes a bug when indexing some non-breaking spaces in extended mode
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Sep 14, 2023
1 parent e96d7d2 commit 72aec47
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

## Unreleased

* Fixes a bug when indexing some non-breaking spaces on ja/zh language pages in extended mode

## v1.0.1 (September 14, 2023)

Hotfix for Pagefind v1.0.0, restoring default-on support for multilingual word segmentation, and helping resolve packaging issues with new dependencies.
Expand Down
55 changes: 47 additions & 8 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,18 @@ impl Fossicker {
// Only proceed if the word was broken into multiple parts
if word_parts.contains(|c: char| c.is_whitespace()) {
let part_words: Vec<_> = word_parts.split_whitespace().collect();
// Index constituents of a compound word as a proportion of the
// weight of the full word.
let per_weight =
(word_weight / part_words.len().try_into().unwrap_or(std::u8::MAX)).max(1);

// Only index two+ character words
for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
store_word(part_word, word_index, per_weight);

if !part_words.is_empty() {
// Index constituents of a compound word as a proportion of the
// weight of the full word.
let per_weight = (word_weight
/ part_words.len().try_into().unwrap_or(std::u8::MAX))
.max(1);

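// Only index parts longer than one byte: `len()` counts UTF-8 bytes, not
// characters, so a single multi-byte character (e.g. CJK) still passes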
for part_word in part_words.into_iter().filter(|w| w.len() > 1) {
store_word(part_word, word_index, per_weight);
}
}
}
// Additionally store any special extra characters we are given
Expand Down Expand Up @@ -774,6 +778,41 @@ mod tests {
);
}

/// Parses a `lang='ja'` page whose paragraph mixes ordinary spaces,
/// non-breaking spaces (U+00A0) and stray punctuation, and checks that the
/// resulting word map contains exactly `hello` (position 0) and `world`
/// (position 1), each with a weight of `1 * 24`.
#[tokio::test]
async fn parse_significant_whitespace() {
    let html = [
        "<html lang='ja'><body>",
        "<p>Hello \u{a0} \u{a0}World ! .</p>",
        "</body></html>",
    ]
    .concat();
    let mut f = test_fossick(html).await;

    // Only the word map is inspected here; the other outputs stay bound
    // (underscore-prefixed) so they live until the end of the test.
    let (_digest, words, _anchors, _word_count) = f.parse_digest();

    let expected = HashMap::from_iter([
        (
            "hello".to_string(),
            vec![FossickedWord {
                position: 0,
                weight: 1 * 24,
            }],
        ),
        (
            "world".to_string(),
            vec![FossickedWord {
                position: 1,
                weight: 1 * 24,
            }],
        ),
    ]);

    assert_eq!(words, expected);
}

#[cfg(not(target_os = "windows"))]
#[test]
fn building_url() {
Expand Down

0 comments on commit 72aec47

Please sign in to comment.