remove stopwords
This commit is contained in:
		
							parent
							
								
									960b0260f9
								
							
						
					
					
						commit
						d821344ade
					
				| 
						 | 
					@ -22,6 +22,7 @@
 | 
				
			||||||
    "@types/lodash-es": "^4.17.12",
 | 
					    "@types/lodash-es": "^4.17.12",
 | 
				
			||||||
    "@types/node": "^16.18.126",
 | 
					    "@types/node": "^16.18.126",
 | 
				
			||||||
    "@types/pako": "^2.0.3",
 | 
					    "@types/pako": "^2.0.3",
 | 
				
			||||||
 | 
					    "@types/stopword": "^2.0.3",
 | 
				
			||||||
    "babel-jest": "^27.5.1",
 | 
					    "babel-jest": "^27.5.1",
 | 
				
			||||||
    "builtin-modules": "^3.3.0",
 | 
					    "builtin-modules": "^3.3.0",
 | 
				
			||||||
    "esbuild": "0.17.19",
 | 
					    "esbuild": "0.17.19",
 | 
				
			||||||
| 
						 | 
					@ -47,7 +48,8 @@
 | 
				
			||||||
    "markdown-link-extractor": "^4.0.2",
 | 
					    "markdown-link-extractor": "^4.0.2",
 | 
				
			||||||
    "minisearch": "7.1.0",
 | 
					    "minisearch": "7.1.0",
 | 
				
			||||||
    "pure-md5": "^0.1.14",
 | 
					    "pure-md5": "^0.1.14",
 | 
				
			||||||
    "search-query-parser": "^1.6.0"
 | 
					    "search-query-parser": "^1.6.0",
 | 
				
			||||||
 | 
					    "stopword": "^3.1.5"
 | 
				
			||||||
  },
 | 
					  },
 | 
				
			||||||
  "pnpm": {
 | 
					  "pnpm": {
 | 
				
			||||||
    "overrides": {
 | 
					    "overrides": {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -32,6 +32,9 @@ importers:
 | 
				
			||||||
      search-query-parser:
 | 
					      search-query-parser:
 | 
				
			||||||
        specifier: ^1.6.0
 | 
					        specifier: ^1.6.0
 | 
				
			||||||
        version: 1.6.0
 | 
					        version: 1.6.0
 | 
				
			||||||
 | 
					      stopword:
 | 
				
			||||||
 | 
					        specifier: ^3.1.5
 | 
				
			||||||
 | 
					        version: 3.1.5
 | 
				
			||||||
    devDependencies:
 | 
					    devDependencies:
 | 
				
			||||||
      '@babel/preset-env':
 | 
					      '@babel/preset-env':
 | 
				
			||||||
        specifier: ^7.26.9
 | 
					        specifier: ^7.26.9
 | 
				
			||||||
| 
						 | 
					@ -57,6 +60,9 @@ importers:
 | 
				
			||||||
      '@types/pako':
 | 
					      '@types/pako':
 | 
				
			||||||
        specifier: ^2.0.3
 | 
					        specifier: ^2.0.3
 | 
				
			||||||
        version: 2.0.3
 | 
					        version: 2.0.3
 | 
				
			||||||
 | 
					      '@types/stopword':
 | 
				
			||||||
 | 
					        specifier: ^2.0.3
 | 
				
			||||||
 | 
					        version: 2.0.3
 | 
				
			||||||
      babel-jest:
 | 
					      babel-jest:
 | 
				
			||||||
        specifier: ^27.5.1
 | 
					        specifier: ^27.5.1
 | 
				
			||||||
        version: 27.5.1(@babel/core@7.26.10)
 | 
					        version: 27.5.1(@babel/core@7.26.10)
 | 
				
			||||||
| 
						 | 
					@ -1137,6 +1143,9 @@ packages:
 | 
				
			||||||
  '@types/stack-utils@2.0.3':
 | 
					  '@types/stack-utils@2.0.3':
 | 
				
			||||||
    resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
 | 
					    resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  '@types/stopword@2.0.3':
 | 
				
			||||||
 | 
					    resolution: {integrity: sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  '@types/tern@0.23.9':
 | 
					  '@types/tern@0.23.9':
 | 
				
			||||||
    resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
 | 
					    resolution: {integrity: sha512-ypzHFE/wBzh+BlH6rrBgS5I/Z7RD21pGhZ2rltb/+ZrVM1awdZwjx7hE5XfuYgHWk9uvV5HLZN3SloevCAp3Bw==}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2783,6 +2792,9 @@ packages:
 | 
				
			||||||
    resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
 | 
					    resolution: {integrity: sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==}
 | 
				
			||||||
    engines: {node: '>=10'}
 | 
					    engines: {node: '>=10'}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  stopword@3.1.5:
 | 
				
			||||||
 | 
					    resolution: {integrity: sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  stream-to-array@2.3.0:
 | 
					  stream-to-array@2.3.0:
 | 
				
			||||||
    resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
 | 
					    resolution: {integrity: sha512-UsZtOYEn4tWU2RGLOXr/o/xjRBftZRlG3dEWoaHr8j4GuypJ3isitGbVyjQKAuMu+xbiop8q224TjiZWc4XTZA==}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4348,6 +4360,8 @@ snapshots:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  '@types/stack-utils@2.0.3': {}
 | 
					  '@types/stack-utils@2.0.3': {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  '@types/stopword@2.0.3': {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  '@types/tern@0.23.9':
 | 
					  '@types/tern@0.23.9':
 | 
				
			||||||
    dependencies:
 | 
					    dependencies:
 | 
				
			||||||
      '@types/estree': 1.0.6
 | 
					      '@types/estree': 1.0.6
 | 
				
			||||||
| 
						 | 
					@ -6231,6 +6245,8 @@ snapshots:
 | 
				
			||||||
    dependencies:
 | 
					    dependencies:
 | 
				
			||||||
      escape-string-regexp: 2.0.0
 | 
					      escape-string-regexp: 2.0.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  stopword@3.1.5: {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  stream-to-array@2.3.0:
 | 
					  stream-to-array@2.3.0:
 | 
				
			||||||
    dependencies:
 | 
					    dependencies:
 | 
				
			||||||
      any-promise: 1.3.0
 | 
					      any-promise: 1.3.0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,8 @@
 | 
				
			||||||
import type { QueryCombination } from 'minisearch'
 | 
					import type { QueryCombination } from 'minisearch'
 | 
				
			||||||
 | 
					import { removeStopwords } from 'stopword'
 | 
				
			||||||
import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
 | 
					import { BRACKETS_AND_SPACE, chsRegex, SPACE_OR_PUNCTUATION } from '../globals'
 | 
				
			||||||
import type LocatorPlugin from '../main'
 | 
					import type LocatorPlugin from '../main'
 | 
				
			||||||
import { splitCamelCase, splitHyphens } from '../tools/utils'
 | 
					import { getStopWords, splitCamelCase, splitHyphens } from '../tools/utils'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const markdownLinkExtractor = require('markdown-link-extractor')
 | 
					const markdownLinkExtractor = require('markdown-link-extractor')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -16,9 +17,8 @@ export class Tokenizer {
 | 
				
			||||||
   */
 | 
					   */
 | 
				
			||||||
  public tokenizeForIndexing(text: string): string[] {
 | 
					  public tokenizeForIndexing(text: string): string[] {
 | 
				
			||||||
    try {
 | 
					    try {
 | 
				
			||||||
      const lang = eld.detectLanguage(text)
 | 
					      let words = this.tokenizeIntoWords(text)
 | 
				
			||||||
      console.log(lang)
 | 
					      words = removeStopwords(words, getStopWords())
 | 
				
			||||||
      const words = this.tokenizeIntoWords(text)
 | 
					 | 
				
			||||||
      let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
 | 
					      let tokens = this.tokenizeIntoTokens(text, { skipChs: true })
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      tokens = [
 | 
					      tokens = [
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,3 +1,4 @@
 | 
				
			||||||
 | 
					import { type BinaryLike, createHash } from 'crypto'
 | 
				
			||||||
import {
 | 
					import {
 | 
				
			||||||
  type CachedMetadata,
 | 
					  type CachedMetadata,
 | 
				
			||||||
  getAllTags,
 | 
					  getAllTags,
 | 
				
			||||||
| 
						 | 
					@ -5,9 +6,9 @@ import {
 | 
				
			||||||
  parseFrontMatterAliases,
 | 
					  parseFrontMatterAliases,
 | 
				
			||||||
  Platform,
 | 
					  Platform,
 | 
				
			||||||
} from 'obsidian'
 | 
					} from 'obsidian'
 | 
				
			||||||
import { isSearchMatch, type SearchMatch } from '../globals'
 | 
					 | 
				
			||||||
import { type BinaryLike, createHash } from 'crypto'
 | 
					 | 
				
			||||||
import { md5 } from 'pure-md5'
 | 
					import { md5 } from 'pure-md5'
 | 
				
			||||||
 | 
					import { eng, fra } from 'stopword'
 | 
				
			||||||
 | 
					import { isSearchMatch, type SearchMatch } from '../globals'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export function pathWithoutFilename(path: string): string {
 | 
					export function pathWithoutFilename(path: string): string {
 | 
				
			||||||
  const split = path.split('/')
 | 
					  const split = path.split('/')
 | 
				
			||||||
| 
						 | 
					@ -279,3 +280,13 @@ export const countError = (() => {
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
})()
 | 
					})()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let stopWords: string[] = []
 | 
				
			||||||
 | 
					export function getStopWords(): string[] {
 | 
				
			||||||
 | 
					  if (!stopWords.length) {
 | 
				
			||||||
 | 
					    stopWords = [...eng, ...fra]
 | 
				
			||||||
 | 
					    // Remove duplicates
 | 
				
			||||||
 | 
					    stopWords = [...new Set(stopWords)]
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return stopWords
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user