remove stopwords

Simon Cambier 2025-06-25 21:29:15 +02:00
parent 960b0260f9
commit d821344ade
4 changed files with 36 additions and 7 deletions

package.json

@@ -22,6 +22,7 @@
     "@types/lodash-es": "^4.17.12",
     "@types/node": "^16.18.126",
     "@types/pako": "^2.0.3",
+    "@types/stopword": "^2.0.3",
     "babel-jest": "^27.5.1",
     "builtin-modules": "^3.3.0",
     "esbuild": "0.17.19",
@@ -47,7 +48,8 @@
     "markdown-link-extractor": "^4.0.2",
     "minisearch": "7.1.0",
     "pure-md5": "^0.1.14",
-    "search-query-parser": "^1.6.0"
+    "search-query-parser": "^1.6.0",
+    "stopword": "^3.1.5"
   },
   "pnpm": {
     "overrides": {

pnpm-lock.yaml

@@ -32,6 +32,9 @@ importers:
       search-query-parser:
         specifier: ^1.6.0
         version: 1.6.0
+      stopword:
+        specifier: ^3.1.5
+        version: 3.1.5
     devDependencies:
       '@babel/preset-env':
         specifier: ^7.26.9
@@ -57,6 +60,9 @@ importers:
       '@types/pako':
         specifier: ^2.0.3
         version: 2.0.3
+      '@types/stopword':
+        specifier: ^2.0.3
+        version: 2.0.3
       babel-jest:
         specifier: ^27.5.1
         version: 27.5.1(@babel/core@7.26.10)
@@ -1137,6 +1143,9 @@ packages:
   '@types/stack-utils@2.0.3':
     resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}

+  '@types/stopword@2.0.3':
+    resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
+
   '@types/tern@0.23.9':
     resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
@@ -2783,6 +2792,9 @@ packages:
     resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
     engines: {node: '>=10'}

+  stopword@3.1.5:
+    resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
+
   stream-to-array@2.3.0:
     resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
@@ -4348,6 +4360,8 @@ snapshots:
   '@types/stack-utils@2.0.3': {}

+  '@types/stopword@2.0.3': {}
+
   '@types/tern@0.23.9':
     dependencies:
       '@types/estree': 1.0.6
@@ -6231,6 +6245,8 @@ snapshots:
     dependencies:
       escape-string-regexp: 2.0.0

+  stopword@3.1.5: {}
+
   stream-to-array@2.3.0:
     dependencies:
       any-promise: 1.3.0

src/search/tokenizer.ts

@@ -1,7 +1,8 @@
 import type { QueryCombination } from 'minisearch'
+import { removeStopwords } from 'stopword'
 import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
 import type LocatorPlugin from '../main'
-import { splitCamelCase, splitHyphens } from '../tools/utils'
+import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'

 const markdownLinkExtractor = require('markdown-link-extractor')
@@ -16,9 +17,8 @@ export class Tokenizer {
    */
   public tokenizeForIndexing(text: string): string[] {
     try {
-      const lang = eld.detectLanguage(text)
-      console.log(lang)
-      const words = this.tokenizeIntoWords(text)
+      let words = this.tokenizeIntoWords(text)
+      words = removeStopwords(words, getStopWords())
       let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
       tokens = [
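
The two added lines are the whole behavioral change: the leftover language-detection debug code (eld.detectLanguage plus console.log) goes away, and the word list is filtered through removeStopwords before indexing. A self-contained sketch of that filtering step, with a hypothetical whitespace splitter standing in for the plugin's tokenizeIntoWords():

import { removeStopwords, eng, fra } from 'stopword'

// Hypothetical stand-in for Tokenizer.tokenizeIntoWords(); the real
// method splits on word boundaries and punctuation, not just spaces.
function tokenizeIntoWords(text: string): string[] {
  return text.split(/\s+/).filter(w => w.length > 0)
}

let words = tokenizeIntoWords('the quick brown fox and the lazy dog')
// removeStopwords() returns a new array with every token found in the
// provided list dropped; the input array is not mutated.
words = removeStopwords(words, [...eng, ...fra])
console.log(words) // ['quick', 'brown', 'fox', 'lazy', 'dog']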

src/tools/utils.ts

@@ -1,3 +1,4 @@
+import { type BinaryLike, createHash } from 'crypto'
 import {
   type CachedMetadata,
   getAllTags,
@@ -5,9 +6,9 @@ import {
   parseFrontMatterAliases,
   Platform,
 } from 'obsidian'
-import { isSearchMatch, type SearchMatch } from '../globals'
-import { type BinaryLike, createHash } from 'crypto'
 import { md5 } from 'pure-md5'
+import { eng, fra } from 'stopword'
+import { isSearchMatch, type SearchMatch } from '../globals'

 export function pathWithoutFilename(path: string): string {
   const split = path.split('/')
@@ -279,3 +280,13 @@ export const countError = (() => {
     }
   }
 })()
+
+let stopWords: string[] = []
+
+export function getStopWords(): string[] {
+  if (!stopWords.length) {
+    stopWords = [...eng, ...fra]
+    // Remove duplicates
+    stopWords = [...new Set(stopWords)]
+  }
+  return stopWords
+}
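
getStopWords() builds the merged English + French list once and memoizes it in the module-level stopWords variable, so the spread-and-Set deduplication only runs on the first call. A short usage sketch; the import path is hypothetical and depends on where the consuming file sits relative to tools/utils:

import { removeStopwords } from 'stopword'
import { getStopWords } from './tools/utils' // hypothetical path

const stopWords = getStopWords() // first call: build + cache
console.log(stopWords.includes('the')) // true, from the eng list
console.log(stopWords.includes('le'))  // true, from the fra list

// Tokens from either language are filtered out in one pass:
console.log(removeStopwords(['le', 'chat', 'the', 'cat'], stopWords))
// -> ['chat', 'cat']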