buggy Arabic diacritics removal

This commit is contained in:
Simon Cambier 2025-07-11 18:57:07 +02:00
parent 92a7af6ec5
commit 921a72b015

View File

@ -9,6 +9,7 @@ import {
import { md5 } from 'pure-md5' import { md5 } from 'pure-md5'
import { eng, fra } from 'stopword' import { eng, fra } from 'stopword'
import { isSearchMatch, type SearchMatch } from '../globals' import { isSearchMatch, type SearchMatch } from '../globals'
import { settings } from 'src/settings'
export function pathWithoutFilename(path: string): string { export function pathWithoutFilename(path: string): string {
const split = path.split('/') const split = path.split('/')
@ -121,11 +122,12 @@ export function removeDiacritics(str: string, arabic = true): string {
return '' return ''
} }
// TODO: add a global setting to toggle this, since it impacts performance
if (arabic) { if (arabic) {
// Arabic diacritics // Arabic diacritics
// https://stackoverflow.com/a/40959537 // https://stackoverflow.com/a/40959537
str = str str = str
.replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '') // .replace(/([^\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z 0-9])/g, '')
.replace(/(آ|إ|أ)/g, 'ا') .replace(/(آ|إ|أ)/g, 'ا')
.replace(/(ة)/g, 'ه') .replace(/(ة)/g, 'ه')
.replace(/(ئ|ؤ)/g, 'ء') .replace(/(ئ|ؤ)/g, 'ء')
@ -144,6 +146,7 @@ export function removeDiacritics(str: string, arabic = true): string {
str = str.normalize('NFD').replace(diacriticsRegex, '').normalize('NFC') str = str.normalize('NFD').replace(diacriticsRegex, '').normalize('NFC')
str = str.replaceAll('[__locator__backtick__]', '`') str = str.replaceAll('[__locator__backtick__]', '`')
str = str.replaceAll('[__locator__caret__]', '^') str = str.replaceAll('[__locator__caret__]', '^')
return str return str
} }