remove stopwords
This commit is contained in:
parent
960b0260f9
commit
d821344ade
|
@ -22,6 +22,7 @@
|
||||||
"@types/lodash-es": "^4.17.12",
|
"@types/lodash-es": "^4.17.12",
|
||||||
"@types/node": "^16.18.126",
|
"@types/node": "^16.18.126",
|
||||||
"@types/pako": "^2.0.3",
|
"@types/pako": "^2.0.3",
|
||||||
|
"@types/stopword": "^2.0.3",
|
||||||
"babel-jest": "^27.5.1",
|
"babel-jest": "^27.5.1",
|
||||||
"builtin-modules": "^3.3.0",
|
"builtin-modules": "^3.3.0",
|
||||||
"esbuild": "0.17.19",
|
"esbuild": "0.17.19",
|
||||||
|
@ -47,7 +48,8 @@
|
||||||
"markdown-link-extractor": "^4.0.2",
|
"markdown-link-extractor": "^4.0.2",
|
||||||
"minisearch": "7.1.0",
|
"minisearch": "7.1.0",
|
||||||
"pure-md5": "^0.1.14",
|
"pure-md5": "^0.1.14",
|
||||||
"search-query-parser": "^1.6.0"
|
"search-query-parser": "^1.6.0",
|
||||||
|
"stopword": "^3.1.5"
|
||||||
},
|
},
|
||||||
"pnpm": {
|
"pnpm": {
|
||||||
"overrides": {
|
"overrides": {
|
||||||
|
|
|
@ -32,6 +32,9 @@ importers:
|
||||||
search-query-parser:
|
search-query-parser:
|
||||||
specifier: ^1.6.0
|
specifier: ^1.6.0
|
||||||
version: 1.6.0
|
version: 1.6.0
|
||||||
|
stopword:
|
||||||
|
specifier: ^3.1.5
|
||||||
|
version: 3.1.5
|
||||||
devDependencies:
|
devDependencies:
|
||||||
'@babel/preset-env':
|
'@babel/preset-env':
|
||||||
specifier: ^7.26.9
|
specifier: ^7.26.9
|
||||||
|
@ -57,6 +60,9 @@ importers:
|
||||||
'@types/pako':
|
'@types/pako':
|
||||||
specifier: ^2.0.3
|
specifier: ^2.0.3
|
||||||
version: 2.0.3
|
version: 2.0.3
|
||||||
|
'@types/stopword':
|
||||||
|
specifier: ^2.0.3
|
||||||
|
version: 2.0.3
|
||||||
babel-jest:
|
babel-jest:
|
||||||
specifier: ^27.5.1
|
specifier: ^27.5.1
|
||||||
version: 27.5.1(@babel/core@7.26.10)
|
version: 27.5.1(@babel/core@7.26.10)
|
||||||
|
@ -1137,6 +1143,9 @@ packages:
|
||||||
'@types/stack-utils@2.0.3':
|
'@types/stack-utils@2.0.3':
|
||||||
resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
|
resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
|
||||||
|
|
||||||
|
'@types/stopword@2.0.3':
|
||||||
|
resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
|
||||||
|
|
||||||
'@types/tern@0.23.9':
|
'@types/tern@0.23.9':
|
||||||
resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
|
resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
|
||||||
|
|
||||||
|
@ -2783,6 +2792,9 @@ packages:
|
||||||
resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
|
resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
|
||||||
engines: {node: '>=10'}
|
engines: {node: '>=10'}
|
||||||
|
|
||||||
|
stopword@3.1.5:
|
||||||
|
resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
|
||||||
|
|
||||||
stream-to-array@2.3.0:
|
stream-to-array@2.3.0:
|
||||||
resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
|
resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
|
||||||
|
|
||||||
|
@ -4348,6 +4360,8 @@ snapshots:
|
||||||
|
|
||||||
'@types/stack-utils@2.0.3': {}
|
'@types/stack-utils@2.0.3': {}
|
||||||
|
|
||||||
|
'@types/stopword@2.0.3': {}
|
||||||
|
|
||||||
'@types/tern@0.23.9':
|
'@types/tern@0.23.9':
|
||||||
dependencies:
|
dependencies:
|
||||||
'@types/estree': 1.0.6
|
'@types/estree': 1.0.6
|
||||||
|
@ -6231,6 +6245,8 @@ snapshots:
|
||||||
dependencies:
|
dependencies:
|
||||||
escape-string-regexp: 2.0.0
|
escape-string-regexp: 2.0.0
|
||||||
|
|
||||||
|
stopword@3.1.5: {}
|
||||||
|
|
||||||
stream-to-array@2.3.0:
|
stream-to-array@2.3.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
any-promise: 1.3.0
|
any-promise: 1.3.0
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
import type { QueryCombination } from 'minisearch'
|
import type { QueryCombination } from 'minisearch'
|
||||||
|
import { removeStopwords } from 'stopword'
|
||||||
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
|
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
|
||||||
import type LocatorPlugin from '../main'
|
import type LocatorPlugin from '../main'
|
||||||
import { splitCamelCase, splitHyphens } from '../tools/utils'
|
import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'
|
||||||
|
|
||||||
const markdownLinkExtractor = require('markdown-link-extractor')
|
const markdownLinkExtractor = require('markdown-link-extractor')
|
||||||
|
|
||||||
|
@ -16,9 +17,8 @@ export class Tokenizer {
|
||||||
*/
|
*/
|
||||||
public tokenizeForIndexing(text: string): string[] {
|
public tokenizeForIndexing(text: string): string[] {
|
||||||
try {
|
try {
|
||||||
const lang = eld.detectLanguage(text)
|
let words = this.tokenizeIntoWords(text)
|
||||||
console.log(lang)
|
words = removeStopwords(words, getStopWords())
|
||||||
const words = this.tokenizeIntoWords(text)
|
|
||||||
let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
|
let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
|
||||||
|
|
||||||
tokens = [
|
tokens = [
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import { type BinaryLike, createHash } from 'crypto'
|
||||||
import {
|
import {
|
||||||
type CachedMetadata,
|
type CachedMetadata,
|
||||||
getAllTags,
|
getAllTags,
|
||||||
|
@ -5,9 +6,9 @@ import {
|
||||||
parseFrontMatterAliases,
|
parseFrontMatterAliases,
|
||||||
Platform,
|
Platform,
|
||||||
} from 'obsidian'
|
} from 'obsidian'
|
||||||
import { isSearchMatch, type SearchMatch } from '../globals'
|
|
||||||
import { type BinaryLike, createHash } from 'crypto'
|
|
||||||
import { md5 } from 'pure-md5'
|
import { md5 } from 'pure-md5'
|
||||||
|
import { eng, fra } from 'stopword'
|
||||||
|
import { isSearchMatch, type SearchMatch } from '../globals'
|
||||||
|
|
||||||
export function pathWithoutFilename(path: string): string {
|
export function pathWithoutFilename(path: string): string {
|
||||||
const split = path.split('/')
|
const split = path.split('/')
|
||||||
|
@ -279,3 +280,13 @@ export const countError = (() => {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})()
|
})()
|
||||||
|
|
||||||
|
let stopWords: string[] = []
|
||||||
|
export function getStopWords(): string[] {
|
||||||
|
if (!stopWords.length) {
|
||||||
|
stopWords = [...eng, ...fra]
|
||||||
|
// Remove duplicates
|
||||||
|
stopWords = [...new Set(stopWords)]
|
||||||
|
}
|
||||||
|
return stopWords
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user