diff --git a/package.json b/package.json
index 23e108c..97a1d1b 100644
--- a/package.json
+++ b/package.json
@@ -22,6 +22,7 @@
     "@types/lodash-es": "^4.17.12",
     "@types/node": "^16.18.126",
     "@types/pako": "^2.0.3",
+    "@types/stopword": "^2.0.3",
     "babel-jest": "^27.5.1",
     "builtin-modules": "^3.3.0",
     "esbuild": "0.17.19",
@@ -47,7 +48,8 @@
     "markdown-link-extractor": "^4.0.2",
     "minisearch": "7.1.0",
     "pure-md5": "^0.1.14",
-    "search-query-parser": "^1.6.0"
+    "search-query-parser": "^1.6.0",
+    "stopword": "^3.1.5"
   },
   "pnpm": {
     "overrides": {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index bd39163..4bb6920 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -32,6 +32,9 @@ importers:
       search-query-parser:
         specifier: ^1.6.0
         version: 1.6.0
+      stopword:
+        specifier: ^3.1.5
+        version: 3.1.5
     devDependencies:
       '@babel/preset-env':
         specifier: ^7.26.9
@@ -57,6 +60,9 @@ importers:
       '@types/pako':
         specifier: ^2.0.3
         version: 2.0.3
+      '@types/stopword':
+        specifier: ^2.0.3
+        version: 2.0.3
       babel-jest:
         specifier: ^27.5.1
         version: 27.5.1(@babel/core@7.26.10)
@@ -1137,6 +1143,9 @@ packages:
   '@types/stack-utils@2.0.3':
     resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}

+  '@types/stopword@2.0.3':
+    resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
+
   '@types/tern@0.23.9':
     resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}

@@ -2783,6 +2792,9 @@ packages:
     resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
     engines: {node: '>=10'}

+  stopword@3.1.5:
+    resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
+
   stream-to-array@2.3.0:
     resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}

@@ -4348,6 +4360,8 @@ snapshots:
   '@types/stack-utils@2.0.3': {}

+  '@types/stopword@2.0.3': {}
+
   '@types/tern@0.23.9':
     dependencies:
       '@types/estree': 1.0.6
@@ -6231,6 +6245,8 @@ snapshots:
     dependencies:
       escape-string-regexp: 2.0.0

+  stopword@3.1.5: {}
+
   stream-to-array@2.3.0:
     dependencies:
       any-promise: 1.3.0
diff --git a/src/search/tokenizer.ts b/src/search/tokenizer.ts
index 8bfbecc..de73580 100644
--- a/src/search/tokenizer.ts
+++ b/src/search/tokenizer.ts
@@ -1,7 +1,8 @@
 import type { QueryCombination } from 'minisearch'
+import { removeStopwords } from 'stopword'
 import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
 import type LocatorPlugin from '../main'
-import { splitCamelCase, splitHyphens } from '../tools/utils'
+import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'

 const markdownLinkExtractor = require('markdown-link-extractor')

@@ -16,9 +17,8 @@ export class Tokenizer {
    */
   public tokenizeForIndexing(text: string): string[] {
     try {
-      const lang = eld.detectLanguage(text)
-      console.log(lang)
-      const words = this.tokenizeIntoWords(text)
+      let words = this.tokenizeIntoWords(text)
+      words = removeStopwords(words, getStopWords())

       let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
       tokens = [
diff --git a/src/tools/utils.ts b/src/tools/utils.ts
index 218c156..fe535c6 100644
--- a/src/tools/utils.ts
+++ b/src/tools/utils.ts
@@ -1,3 +1,4 @@
+import { type BinaryLike, createHash } from 'crypto'
 import {
   type CachedMetadata,
   getAllTags,
@@ -5,9 +6,9 @@ import {
   parseFrontMatterAliases,
   Platform,
 } from 'obsidian'
-import { isSearchMatch, type SearchMatch } from '../globals'
-import { type BinaryLike, createHash } from 'crypto'
 import { md5 } from 'pure-md5'
+import { eng, fra } from 'stopword'
+import { isSearchMatch, type SearchMatch } from '../globals'

 export function pathWithoutFilename(path: string): string {
   const split = path.split('/')
@@ -279,3 +280,13 @@ export const countError = (() => {
     }
   }
 })()
+
+let stopWords: string[] = []
+export function getStopWords(): string[] {
+  if (!stopWords.length) {
+    stopWords = [...eng, ...fra]
+    // Remove duplicates
+    stopWords = [...new Set(stopWords)]
+  }
+  return stopWords
+}
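
For context, a minimal standalone sketch of the new stopword flow (not part of the diff): removeStopwords(tokens, customList) and the eng/fra lists are the stopword package's documented API, while the whitespace split below is a simplified stand-in for the plugin's real tokenizeIntoWords().

import { eng, fra, removeStopwords } from 'stopword'

// Same memoization as the new getStopWords() in src/tools/utils.ts:
// merge the English and French lists once, deduping via a Set.
let stopWords: string[] = []
function getStopWords(): string[] {
  if (!stopWords.length) {
    stopWords = [...new Set([...eng, ...fra])]
  }
  return stopWords
}

// Simplified stand-in for Tokenizer.tokenizeForIndexing(): split into
// words, then drop stopwords before the terms reach the minisearch index.
function tokenizeForIndexing(text: string): string[] {
  const words = text.split(/\s+/).filter(Boolean) // placeholder for tokenizeIntoWords()
  return removeStopwords(words, getStopWords())
}

console.log(tokenizeForIndexing('the cat sat on the mat'))
// => ['cat', 'sat', 'mat'] ("the" and "on" are in the English list)

Building the merged list once and passing it explicitly keeps the per-document cost to a single removeStopwords() pass instead of re-running the language detection that the old eld.detectLanguage() call performed.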