remove stopwords
This commit is contained in:
parent
960b0260f9
commit
d821344ade
|
@@ -22,6 +22,7 @@
|
|||
"@types/lodash-es": "^4.17.12",
|
||||
"@types/node": "^16.18.126",
|
||||
"@types/pako": "^2.0.3",
|
||||
"@types/stopword": "^2.0.3",
|
||||
"babel-jest": "^27.5.1",
|
||||
"builtin-modules": "^3.3.0",
|
||||
"esbuild": "0.17.19",
|
||||
|
@@ -47,7 +48,8 @@
|
|||
"markdown-link-extractor": "^4.0.2",
|
||||
"minisearch": "7.1.0",
|
||||
"pure-md5": "^0.1.14",
|
||||
"search-query-parser": "^1.6.0"
|
||||
"search-query-parser": "^1.6.0",
|
||||
"stopword": "^3.1.5"
|
||||
},
|
||||
"pnpm": {
|
||||
"overrides": {
|
||||
|
|
|
@@ -32,6 +32,9 @@ importers:
|
|||
search-query-parser:
|
||||
specifier: ^1.6.0
|
||||
version: 1.6.0
|
||||
stopword:
|
||||
specifier: ^3.1.5
|
||||
version: 3.1.5
|
||||
devDependencies:
|
||||
'@babel/preset-env':
|
||||
specifier: ^7.26.9
|
||||
|
@@ -57,6 +60,9 @@ importers:
|
|||
'@types/pako':
|
||||
specifier: ^2.0.3
|
||||
version: 2.0.3
|
||||
'@types/stopword':
|
||||
specifier: ^2.0.3
|
||||
version: 2.0.3
|
||||
babel-jest:
|
||||
specifier: ^27.5.1
|
||||
version: 27.5.1(@babel/core@7.26.10)
|
||||
|
@@ -1137,6 +1143,9 @@ packages:
|
|||
'@types/stack-utils@2.0.3':
|
||||
resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
|
||||
|
||||
'@types/stopword@2.0.3':
|
||||
resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
|
||||
|
||||
'@types/tern@0.23.9':
|
||||
resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
|
||||
|
||||
|
@@ -2783,6 +2792,9 @@ packages:
|
|||
resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
stopword@3.1.5:
|
||||
resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
|
||||
|
||||
stream-to-array@2.3.0:
|
||||
resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
|
||||
|
||||
|
@@ -4348,6 +4360,8 @@ snapshots:
|
|||
|
||||
'@types/stack-utils@2.0.3': {}
|
||||
|
||||
'@types/stopword@2.0.3': {}
|
||||
|
||||
'@types/tern@0.23.9':
|
||||
dependencies:
|
||||
'@types/estree': 1.0.6
|
||||
|
@@ -6231,6 +6245,8 @@ snapshots:
|
|||
dependencies:
|
||||
escape-string-regexp: 2.0.0
|
||||
|
||||
stopword@3.1.5: {}
|
||||
|
||||
stream-to-array@2.3.0:
|
||||
dependencies:
|
||||
any-promise: 1.3.0
|
||||
|
|
|
@@ -1,7 +1,8 @@
|
|||
import type { QueryCombination } from 'minisearch'
|
||||
import { removeStopwords } from 'stopword'
|
||||
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
|
||||
import type LocatorPlugin from '../main'
|
||||
import { splitCamelCase, splitHyphens } from '../tools/utils'
|
||||
import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'
|
||||
|
||||
const markdownLinkExtractor = require('markdown-link-extractor')
|
||||
|
||||
|
@@ -16,9 +17,8 @@ export class Tokenizer {
|
|||
*/
|
||||
public tokenizeForIndexing(text: string): string[] {
|
||||
try {
|
||||
const lang = eld.detectLanguage(text)
|
||||
console.log(lang)
|
||||
const words = this.tokenizeIntoWords(text)
|
||||
let words = this.tokenizeIntoWords(text)
|
||||
words = removeStopwords(words, getStopWords())
|
||||
let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
|
||||
|
||||
tokens = [
|
||||
|
|
|
@@ -1,3 +1,4 @@
|
|||
import { type BinaryLike, createHash } from 'crypto'
|
||||
import {
|
||||
type CachedMetadata,
|
||||
getAllTags,
|
||||
|
@@ -5,9 +6,9 @@ import {
|
|||
parseFrontMatterAliases,
|
||||
Platform,
|
||||
} from 'obsidian'
|
||||
import { isSearchMatch, type SearchMatch } from '../globals'
|
||||
import { type BinaryLike, createHash } from 'crypto'
|
||||
import { md5 } from 'pure-md5'
|
||||
import { eng, fra } from 'stopword'
|
||||
import { isSearchMatch, type SearchMatch } from '../globals'
|
||||
|
||||
export function pathWithoutFilename(path: string): string {
|
||||
const split = path.split('/')
|
||||
|
@@ -279,3 +280,13 @@ export const countError = (() => {
|
|||
}
|
||||
}
|
||||
})()
|
||||
|
||||
/**
 * Module-level cache for the merged stop-word list.
 * Stays empty until the first call to `getStopWords`.
 */
let stopWords: string[] = []

/**
 * Returns the combined `eng` and `fra` stop-word lists with duplicates removed.
 *
 * The list is built on first use and cached; later calls return the same
 * array instance, in first-occurrence order (`eng` entries before `fra`).
 *
 * @returns The cached, de-duplicated stop-word array.
 */
export function getStopWords(): string[] {
  // Already built: hand back the cached array.
  if (stopWords.length > 0) {
    return stopWords
  }

  // A Set drops repeated words while preserving insertion order.
  const uniqueWords = new Set<string>([...eng, ...fra])
  stopWords = Array.from(uniqueWords)
  return stopWords
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user