remove stopwords
This commit is contained in:
		
							parent
							
								
									960b0260f9
								
							
						
					
					
						commit
						d821344ade
					
				| 
						 | 
				
			
			@ -22,6 +22,7 @@
 | 
			
		|||
    "@types/lodash-es": "^4.17.12",
 | 
			
		||||
    "@types/node": "^16.18.126",
 | 
			
		||||
    "@types/pako": "^2.0.3",
 | 
			
		||||
    "@types/stopword": "^2.0.3",
 | 
			
		||||
    "babel-jest": "^27.5.1",
 | 
			
		||||
    "builtin-modules": "^3.3.0",
 | 
			
		||||
    "esbuild": "0.17.19",
 | 
			
		||||
| 
						 | 
				
			
			@ -47,7 +48,8 @@
 | 
			
		|||
    "markdown-link-extractor": "^4.0.2",
 | 
			
		||||
    "minisearch": "7.1.0",
 | 
			
		||||
    "pure-md5": "^0.1.14",
 | 
			
		||||
    "search-query-parser": "^1.6.0"
 | 
			
		||||
    "search-query-parser": "^1.6.0",
 | 
			
		||||
    "stopword": "^3.1.5"
 | 
			
		||||
  },
 | 
			
		||||
  "pnpm": {
 | 
			
		||||
    "overrides": {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -32,6 +32,9 @@ importers:
 | 
			
		|||
      search-query-parser:
 | 
			
		||||
        specifier: ^1.6.0
 | 
			
		||||
        version: 1.6.0
 | 
			
		||||
      stopword:
 | 
			
		||||
        specifier: ^3.1.5
 | 
			
		||||
        version: 3.1.5
 | 
			
		||||
    devDependencies:
 | 
			
		||||
      '@babel/preset-env':
 | 
			
		||||
        specifier: ^7.26.9
 | 
			
		||||
| 
						 | 
				
			
			@ -57,6 +60,9 @@ importers:
 | 
			
		|||
      '@types/pako':
 | 
			
		||||
        specifier: ^2.0.3
 | 
			
		||||
        version: 2.0.3
 | 
			
		||||
      '@types/stopword':
 | 
			
		||||
        specifier: ^2.0.3
 | 
			
		||||
        version: 2.0.3
 | 
			
		||||
      babel-jest:
 | 
			
		||||
        specifier: ^27.5.1
 | 
			
		||||
        version: 27.5.1(@babel/core@7.26.10)
 | 
			
		||||
| 
						 | 
				
			
			@ -1137,6 +1143,9 @@ packages:
 | 
			
		|||
  '@types/stack-utils@2.0.3':
 | 
			
		||||
    resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
 | 
			
		||||
 | 
			
		||||
  '@types/stopword@2.0.3':
 | 
			
		||||
    resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
 | 
			
		||||
 | 
			
		||||
  '@types/tern@0.23.9':
 | 
			
		||||
    resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -2783,6 +2792,9 @@ packages:
 | 
			
		|||
    resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
 | 
			
		||||
    engines: {node: '>=10'}
 | 
			
		||||
 | 
			
		||||
  stopword@3.1.5:
 | 
			
		||||
    resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
 | 
			
		||||
 | 
			
		||||
  stream-to-array@2.3.0:
 | 
			
		||||
    resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -4348,6 +4360,8 @@ snapshots:
 | 
			
		|||
 | 
			
		||||
  '@types/stack-utils@2.0.3': {}
 | 
			
		||||
 | 
			
		||||
  '@types/stopword@2.0.3': {}
 | 
			
		||||
 | 
			
		||||
  '@types/tern@0.23.9':
 | 
			
		||||
    dependencies:
 | 
			
		||||
      '@types/estree': 1.0.6
 | 
			
		||||
| 
						 | 
				
			
			@ -6231,6 +6245,8 @@ snapshots:
 | 
			
		|||
    dependencies:
 | 
			
		||||
      escape-string-regexp: 2.0.0
 | 
			
		||||
 | 
			
		||||
  stopword@3.1.5: {}
 | 
			
		||||
 | 
			
		||||
  stream-to-array@2.3.0:
 | 
			
		||||
    dependencies:
 | 
			
		||||
      any-promise: 1.3.0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,8 @@
 | 
			
		|||
import type { QueryCombination } from 'minisearch'
 | 
			
		||||
import { removeStopwords } from 'stopword'
 | 
			
		||||
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
 | 
			
		||||
import type LocatorPlugin from '../main'
 | 
			
		||||
import { splitCamelCase, splitHyphens } from '../tools/utils'
 | 
			
		||||
import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'
 | 
			
		||||
 | 
			
		||||
const markdownLinkExtractor = require('markdown-link-extractor')
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -16,9 +17,8 @@ export class Tokenizer {
 | 
			
		|||
   */
 | 
			
		||||
  public tokenizeForIndexing(text: string): string[] {
 | 
			
		||||
    try {
 | 
			
		||||
      const lang = eld.detectLanguage(text)
 | 
			
		||||
      console.log(lang)
 | 
			
		||||
      const words = this.tokenizeIntoWords(text)
 | 
			
		||||
      let words = this.tokenizeIntoWords(text)
 | 
			
		||||
      words = removeStopwords(words, getStopWords())
 | 
			
		||||
      let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
 | 
			
		||||
 | 
			
		||||
      tokens = [
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,3 +1,4 @@
 | 
			
		|||
import { type BinaryLike, createHash } from 'crypto'
 | 
			
		||||
import {
 | 
			
		||||
  type CachedMetadata,
 | 
			
		||||
  getAllTags,
 | 
			
		||||
| 
						 | 
				
			
			@ -5,9 +6,9 @@ import {
 | 
			
		|||
  parseFrontMatterAliases,
 | 
			
		||||
  Platform,
 | 
			
		||||
} from 'obsidian'
 | 
			
		||||
import { isSearchMatch, type SearchMatch } from '../globals'
 | 
			
		||||
import { type BinaryLike, createHash } from 'crypto'
 | 
			
		||||
import { md5 } from 'pure-md5'
 | 
			
		||||
import { eng, fra } from 'stopword'
 | 
			
		||||
import { isSearchMatch, type SearchMatch } from '../globals'
 | 
			
		||||
 | 
			
		||||
export function pathWithoutFilename(path: string): string {
 | 
			
		||||
  const split = path.split('/')
 | 
			
		||||
| 
						 | 
				
			
			@ -279,3 +280,13 @@ export const countError = (() => {
 | 
			
		|||
    }
 | 
			
		||||
  }
 | 
			
		||||
})()
 | 
			
		||||
 | 
			
		||||
/** Module-level cache for the merged stop-word list; empty until first requested. */
let stopWords: string[] = []

/**
 * Returns the merged, de-duplicated stop-word list, building it on first use.
 *
 * The list is the concatenation of the `eng` and `fra` word lists exported by
 * the `stopword` package (presumably English and French — confirm against the
 * package docs), with repeated entries removed. Once a non-empty list has been
 * built, the same array instance is returned on every subsequent call.
 *
 * @returns The cached array of stop words.
 */
export function getStopWords(): string[] {
  // Fast path: the list has already been built.
  if (stopWords.length > 0) {
    return stopWords
  }

  // A Set keeps only the first occurrence of each word, preserving order.
  const uniqueWords = new Set<string>([...eng, ...fra])
  stopWords = Array.from(uniqueWords)
  return stopWords
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user