문서를 수정하려면 로그인이 필요합니다.

🇰🇷 한국어 불용어/조사/어미 처리 유틸 소개

최근 수정 시각: 2025-09-03 11:41:51

🇰🇷 한국어 불용어/조사/어미 처리 유틸 소개

한국어 텍스트 마이닝이나 NLP 전처리를 하다 보면 꼭 맞닥뜨리는 문제가 있습니다.
바로 불용어(stopword)와 조사/어미 처리 문제입니다.

영어에서는 단순히 a, an, the, of, in … 같은 불용어를 걸러내면 되지만, 한국어는 조사(이/가, 을/를, 은/는 …)나 어미(-다, -니다, -했어요 …)가 단어 뒤에 붙는 교착 구조이기 때문에 조금 더 정교한 처리가 필요합니다.

이 글에서는 간단한 불용어 사전과 조사/어미 스트리핑 로직을 구현한 한국어 불용어 처리 유틸을 소개합니다.


1. 전체 코드

아래 TypeScript 유틸을 그대로 가져다 쓸 수 있습니다.

// eslint-disable
/**
 * ํ•œ๊ตญ์–ด ๋ถˆ์šฉ์–ด/์กฐ์‚ฌ/์–ด๋ฏธ ์ฒ˜๋ฆฌ ์œ ํ‹ธ (v2)
 * - ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ ๋ฐฐ์—ด -> ์ตœ์ข… Set๋กœ ๋ณ‘ํ•ฉ
 * - ์กฐ์‚ฌ/์–ด๋ฏธ ์ŠคํŠธ๋ฆฌํ•‘ ์˜ต์…˜
 * - ๊ตฌ์–ด/ํ‘œ์ค€ ๋™์‹œ ์ปค๋ฒ„
 */

// 2.1 카테고리별 원천 목록
const JOSA = [
  '์ด','๊ฐ€','์„','๋ฅผ','์€','๋Š”','์˜','์—','์—์„œ','์—๊ฒŒ','๊ป˜','๊ป˜์„œ',
  '๋กœ','์œผ๋กœ','์™€','๊ณผ','๋ž‘','ํ•˜๊ณ ','๋„','๋งŒ','๋ฟ','๊นŒ์ง€','๋ถ€ํ„ฐ',
  '๋งˆ๋‹ค','๋งˆ์ €','์กฐ์ฐจ','๋ฐ–์—','์ด๋‚˜','๋‚˜','์ด๋‚˜๋งˆ','๋‚˜๋งˆ','๋งŒํผ','๋Œ€๋กœ','์ฒ˜๋Ÿผ','๊ฐ™์ด','๋ฟ๋งŒ',
];
const PRONOUNS = [
  '๋‚˜','๋„ˆ','์ €','๊ทธ','์ด','์šฐ๋ฆฌ','๋„ˆํฌ','์ €ํฌ',
  '๊ทธ๋“ค','์ด๋“ค','์ €๋“ค',
  '์ด๊ฒƒ','๊ทธ๊ฒƒ','์ €๊ฒƒ','์ด๊ฑฐ','๊ทธ๊ฑฐ','์ €๊ฑฐ',
  '์—ฌ๊ธฐ','๊ฑฐ๊ธฐ','์ €๊ธฐ',
  '๋ˆ„๊ตฌ','๋ˆ„๊ฐ€','์•„๋ฌด','์•„๋ฌด๋‚˜','์•„๋ฌด๋„',
  '๊ฑ”','์Ÿค','๊ฑ”๋„ค','์Ÿค๋„ค'
];
const CONJUNCTIONS = [
  '๊ทธ๋ฆฌ๊ณ ','๋ฐ','๋˜๋Š”','ํ˜น์€','๊ทธ๋Ÿฌ๋‚˜','ํ•˜์ง€๋งŒ','๊ทธ๋ž˜๋„',
  '๊ทธ๋Ÿฐ๋ฐ','๊ทผ๋ฐ','๊ทธ๋ž˜์„œ','๋”ฐ๋ผ์„œ','๊ทธ๋Ÿฌ๋‹ˆ๊นŒ','๊ทธ๋‹ˆ๊นŒ','์ฆ‰','๋˜','๋˜ํ•œ'
];
const FUNCTION_WORDS = [
  '๊ฒƒ','๊ฑฐ','์ˆ˜','๋“ฑ','๋ฐ','๋•Œ','๊ณณ','์‚ฌ๋žŒ','์ผ','๋ง','์ค‘','๋‚ด',
  '์ ','๊ฒฝ์šฐ','๋ถ€๋ถ„','์ƒํƒœ','์ •๋„','์ž๋ฃŒ','๋‚ด์šฉ','๋ฌธ์ œ','ํ˜•ํƒœ',
  '๊ด€๊ณ„','ํ˜„์žฌ','๋‹น์‹œ','์˜ˆ','์˜ˆ์‹œ','๋Œ€ํ‘œ','๊ธฐ๋ณธ','์ผ๋ฐ˜','๋Œ€ํ•ด','๊ด€๋ จ',
  '์œ„','์•„๋ž˜','์•ž','๋’ค','์•ˆ','๋ฐ–','์™ผ์ชฝ','์˜ค๋ฅธ์ชฝ','๊ฐ€์šด๋ฐ','์ค‘๊ฐ„','์‚ฌ์ด'
];
const TEMPORALS = [
  '์˜ค๋Š˜','์–ด์ œ','๋‚ด์ผ','์ง€๊ธˆ','๋ฐฉ๊ธˆ','์ตœ๊ทผ','์š”์ฆ˜','ํ˜„์žฌ','์˜ˆ์ „','๊ณผ๊ฑฐ','์ดํ›„','์ด์ „','์•ž์œผ๋กœ','๊ณง'
];
const NUMBERS_NATIVE = ['ํ•˜๋‚˜','๋‘˜','์…‹','๋„ท','๋‹ค์„ฏ','์—ฌ์„ฏ','์ผ๊ณฑ','์—ฌ๋Ÿ','์•„ํ™‰','์—ด','์Šค๋ฌผ','์„œ๋ฅธ','๋งˆํ”','์‰ฐ','์˜ˆ์ˆœ','์ผํ”','์—ฌ๋“ ','์•„ํ”'];
const NUMBERS_SINO = ['์ผ','์ด','์‚ผ','์‚ฌ','์˜ค','์œก','์น ','ํŒ”','๊ตฌ','์‹ญ','๋ฐฑ','์ฒœ','๋งŒ','์–ต','์กฐ'];
const ORDINALS = ['์ฒซ','์ฒซ์งธ','๋‘˜์งธ','์…‹์งธ','๋„ท์งธ','๋‹ค์„ฏ์งธ','์—ฌ์„ฏ์งธ','์ผ๊ณฑ์งธ','์—ฌ๋Ÿ์งธ','์•„ํ™‰์งธ','์—ด์งธ'];
const INTENSIFIERS = ['๋งค์šฐ','๋„ˆ๋ฌด','์•„์ฃผ','์ •๋ง','์ง„์งœ','์ฐธ','๊ฝค','์ƒ๋‹นํžˆ','๊ทธ๋‹ค์ง€','๋ณ„๋กœ','๊ฐ€์žฅ','๋”','๋œ','์ตœ๊ณ ','์ตœ๋Œ€','์ตœ์†Œ','์ตœ์ €','ํ‰๊ท '];
const COPULA_AUX = [
  '์ด๋‹ค','์•„๋‹ˆ๋‹ค','์žˆ๋‹ค','์—†๋‹ค','๋˜๋‹ค','ํ•˜๋‹ค','๊ฐ™๋‹ค','์‹ถ๋‹ค','์‹ถ์–ดํ•˜๋‹ค'
];
const ADJ_COMMON = [
  '์ข‹์€','๋‚˜์œ','์ค‘์š”ํ•œ','ํ•„์š”ํ•œ','์œ ์šฉํ•œ','ํšจ๊ณผ์ ์ธ','ํšจ์œจ์ ์ธ','์ ์ ˆํ•œ','์ ํ•ฉํ•œ','๋ถ€์ ์ ˆํ•œ','๋ถ€์ ํ•ฉํ•œ',
  '์ƒˆ๋กœ์šด','์˜ค๋ž˜๋œ','์ตœ์‹ ','๊ตฌ์‹','ํ˜„๋Œ€','์ „ํ†ต','๊ณ ์ „','ํ˜„๋Œ€์ ','์ „ํ†ต์ ','๋ณดํ†ต','ํ‰๋ฒ”','ํŠน๋ณ„','ํŠน์ˆ˜','์ผ๋ฐ˜'
];
const DETERMINERS = ['๋ชจ๋“ ','์ „์ฒด','๊ฐ','๊ฐœ๋ณ„','ํŠน์ •','๋‹ค์–‘ํ•œ','์—ฌ๋Ÿฌ','๊ฐ์ข…'];
const CONNECTIVES_NOUN = ['๋•Œ๋ฌธ','๋•Œ๋ฌธ์—','๋•๋ถ„','๋ฐ”๋žŒ์—','ํ†ตํ•ด','์„ํ†ตํ•ด','๋ฅผํ†ตํ•ด'];

const ENDINGS = [
  '๋‹ค','๋‹ˆ๋‹ค','์Šต๋‹ˆ๋‹ค','ํ•ฉ๋‹ˆ๋‹ค','ํ–ˆ๋‹ค','ํ•˜์˜€๋‹ค','ํ–ˆ๋‹ค๊ฐ€','ํ–ˆ๋‹ค๋ฉฐ','ํ•œ๋‹ค','ํ•œ๋‹ค๋ฉด',
  'ํ•ด์š”','ํ–ˆ์–ด์š”','ํ–ˆ์—ˆ์–ด์š”','ํ–ˆ์Œ','ํ•จ','์ž„','์ค‘','๋˜์—ˆ์Œ','๋˜์—ˆ์Šต๋‹ˆ๋‹ค'
];

// 2.3 Option type
/**
 * Options read by buildKoreanStopwordSet (and forwarded to it by
 * tokenizeAndFilterKo). Every field is optional.
 */
export type StopwordOptions = {
  /** When true, the ADJ_COMMON list is merged into the stopword set. buildKoreanStopwordSet defaults this to true. */
  includeAdjectives?: boolean;
  /**
   * buildKoreanStopwordSet does not use this flag itself; it echoes it back in
   * its result (default true), and tokenizeAndFilterKo then uses it to decide
   * whether stripEnding is applied after stripJosa.
   */
  includeEndingsStrip?: boolean;
  /** When true, the COPULA_AUX list is merged into the stopword set. buildKoreanStopwordSet defaults this to true. */
  includeCopulaAux?: boolean;
  /** Additional words merged into the stopword set (default: empty list). */
  extraStopwords?: string[];
  /** Words deleted from the set after all lists have been merged (default: empty list). */
  excludeStopwords?: string[];
};

// 2.4 Build the stopword set
/**
 * Merges the category lists into one stopword Set.
 *
 * The eleven base lists are always included; COPULA_AUX, ADJ_COMMON and
 * `extraStopwords` are added according to the options. Entries are trimmed
 * before insertion, and entries that are empty (or whitespace-only) are
 * skipped so the set never contains the empty string. Finally every word in
 * `excludeStopwords` (trimmed) is removed again.
 *
 * @param opts Optional switches; every flag defaults to true and both word
 *   lists default to empty.
 * @returns The merged set together with the `includeEndingsStrip` flag, which
 *   is passed through untouched for the caller to act on.
 */
export function buildKoreanStopwordSet(
  opts: StopwordOptions = {},
): { set: Set<string>; includeEndingsStrip: boolean } {
  const {
    includeAdjectives = true,
    includeEndingsStrip = true,
    includeCopulaAux = true,
    extraStopwords = [],
    excludeStopwords = [],
  } = opts;

  const buckets: string[][] = [
    JOSA,
    PRONOUNS,
    CONJUNCTIONS,
    FUNCTION_WORDS,
    TEMPORALS,
    NUMBERS_NATIVE,
    NUMBERS_SINO,
    ORDINALS,
    INTENSIFIERS,
    DETERMINERS,
    CONNECTIVES_NOUN,
  ];
  if (includeCopulaAux) buckets.push(COPULA_AUX);
  if (includeAdjectives) buckets.push(ADJ_COMMON);
  if (extraStopwords.length) buckets.push(extraStopwords);

  const set = new Set<string>();
  for (const group of buckets) {
    for (const w of group) {
      if (!w) continue;
      const trimmed = w.trim();
      // A whitespace-only entry must not register '' as a stopword.
      if (!trimmed) continue;
      set.add(trimmed);
      // Two specific entries also register a second spelling. The comparison is
      // made on the raw (untrimmed) entry.
      // NOTE(review): the string literals in this file look mis-encoded (UTF-8
      // text read through a legacy code page); they are kept as-is here.
      if (w === '๊ฒƒ') set.add('๊ฑฐ');
      if (w === '๊ทธ๋ฆฌ๊ณ ') set.add('๊ทธ๋ฆฌ๊ตฌ');
    }
  }
  for (const keep of excludeStopwords) set.delete(keep.trim());
  return { set, includeEndingsStrip };
}

// 2.5 Text normalization
/**
 * Reduces raw text to letters, digits and single spaces.
 *
 * Steps, in order: NFKC normalization; the characters in the first character
 * class are rewritten to a double quote; every character that is not a
 * Unicode letter, a Unicode number or whitespace becomes a space (this also
 * blanks the double quotes produced by the previous step); whitespace runs
 * collapse to one space; leading and trailing whitespace is trimmed.
 */
export function normalizeKo(text: string) {
  const compatFolded = text.normalize('NFKC');
  const quotesUnified = compatFolded.replace(/[โ€œโ€โ€˜โ€™]/g, '"');
  const symbolsBlanked = quotesUnified.replace(/[^\p{L}\p{N}\s]/gu, ' ');
  const singleSpaced = symbolsBlanked.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}

// Josa / ending suffix patterns.
// Each pattern is anchored at both ends: group 1 lazily captures whatever
// precedes a trailing entry of the list, so a successful match yields the token
// with that trailing entry removed. List entries are regex-escaped before being
// joined into the alternation.
const JOSA_PATTERN = new RegExp(
  '^(.*?)(?:' +
    JOSA.map((josa) => josa.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|') +
    ')$'
);
const ENDING_PATTERN = new RegExp(
  '^(.*?)(?:' +
    ENDINGS.map((ending) => ending.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|') +
    ')$'
);

// 2.6 Josa / ending stripping
/**
 * Removes a trailing particle from a token.
 *
 * Returns capture group 1 of JOSA_PATTERN (defined elsewhere in this file)
 * when the token matches, which may be the empty string; otherwise returns
 * the token unchanged.
 */
function stripJosa(token: string) {
  const matched = JOSA_PATTERN.exec(token);
  if (matched === null) return token;
  return matched[1];
}
/**
 * Removes a trailing verb/adjective ending from a token.
 *
 * @param token A single token.
 * @returns Capture group 1 of ENDING_PATTERN (defined elsewhere in this file)
 *   when the token matches, which may be the empty string; otherwise the
 *   token unchanged. The result is always a string.
 */
function stripEnding(token: string): string {
  const m = token.match(ENDING_PATTERN);
  // Hand back the captured stem, never the match array itself.
  return m?.[1] ?? token;
}

// 2.7 Tokenize + stopword filter
/**
 * Splits text into tokens and drops stopwords.
 *
 * The input is normalized with normalizeKo and split on single spaces. A
 * token that is itself in the stopword set is dropped before any stripping,
 * whatever its length, so a stopword can never survive as a partially
 * stripped stem. Remaining tokens go through stripJosa and, when
 * `includeEndingsStrip` is on, stripEnding; stems that are at most one
 * character long or that are stopwords are dropped.
 *
 * @param input Raw text.
 * @param opts Forwarded unchanged to buildKoreanStopwordSet.
 * @returns The surviving stems in input order (duplicates are kept).
 */
export function tokenizeAndFilterKo(input: string, opts?: StopwordOptions): string[] {
  const { set, includeEndingsStrip } = buildKoreanStopwordSet(opts);
  const out: string[] = [];

  for (const tk of normalizeKo(input).split(' ')) {
    if (!tk) continue;
    // Exact stopword: skip before stripping can turn it into something else.
    if (set.has(tk)) continue;

    let stem = stripJosa(tk);
    if (includeEndingsStrip) stem = stripEnding(stem);

    // One specific stem is rewritten to another spelling before the lookup.
    // NOTE(review): these literals look mis-encoded in this file; kept as-is.
    if (stem === '๊ฑฐ') stem = '๊ฒƒ';
    if (stem.length <= 1 || set.has(stem)) continue;

    out.push(stem);
  }
  return out;
}

// 2.8 사용 예시
/*
const example = "우리 모두가 오늘은 정말 중요한 내용을 자세히 그리고 천천히 살펴봅시다.";
console.log(tokenizeAndFilterKo(example));
// 예상: ["중요", "내용", "자세히", "천천히", "살펴보"]
*/

2. 사용법

import { tokenizeAndFilterKo } from "./stopwords-ko";

const text = "์šฐ๋ฆฌ ๋ชจ๋‘๊ฐ€ ์˜ค๋Š˜์€ ์ •๋ง ์ค‘์š”ํ•œ ๋‚ด์šฉ์„ ์ž์„ธํžˆ ์‚ดํŽด๋ด…์‹œ๋‹ค.";
console.log(tokenizeAndFilterKo(text));

➡️ 출력 예시

["중요", "내용", "자세히", "살펴보"]

3. 카테고리별 불용어 분류

불용어는 목적에 따라 선택적으로 사용될 수 있도록 여러 카테고리로 나누었습니다:

  • 조사(JOSA): "이/가, 을/를, 은/는, 와/과 …"
  • 대명사(PRONOUNS): "나, 너, 우리, 이것, 저것, 누구 …"
  • 접속사(CONJUNCTIONS): "그리고, 하지만, 따라서 …"
  • 기능어(FUNCTION_WORDS): "것, 수, 경우, 문제, 관계 …"
  • 시간 표현(TEMPORALS): "오늘, 내일, 지금, 요즘 …"
  • 수사(NUMBERS): 고유어/한자어 수사 ("하나, 둘… / 일, 이, 삼…")
  • 정도부사(INTENSIFIERS): "매우, 너무, 정말, 가장 …"
  • 계사/보조동사(COPULA_AUX): "이다, 아니다, 있다, 되다 …"
  • 형용사(ADJ_COMMON): "중요한, 유용한, 새로운, 오래된 …"
  • 한정사(DETERMINERS): "모든, 각, 특정, 여러 …"

옵션에 따라 형용사/보조동사 포함 여부를 두어, 도메인별 커스터마이즈가 가능합니다.


4. 불용어 세트 빌드

์—ฌ๋Ÿฌ bucket์„ ํ•ฉ์ณ์„œ ์ตœ์ข… Set<string>์œผ๋กœ ๋งŒ๋“ญ๋‹ˆ๋‹ค.

const { set, includeEndingsStrip } = buildKoreanStopwordSet({
  includeAdjectives: true,   // ํ˜•์šฉ์‚ฌ ํฌํ•จ ์—ฌ๋ถ€
  includeEndingsStrip: true, // ์–ด๋ฏธ ์ŠคํŠธ๋ฆฌํ•‘ ์—ฌ๋ถ€
  includeCopulaAux: true,    // ๊ณ„์‚ฌ/๋ณด์กฐ๋™์‚ฌ ์ œ๊ฑฐ ์—ฌ๋ถ€
  extraStopwords: ['ํŠนํžˆ', '๊ทธ๋ฆฌ๊ณ ๋‚˜์„œ'], // ์‚ฌ์šฉ์ž ์ถ”๊ฐ€
  excludeStopwords: ['์ค‘์š”ํ•œ']            // ํŠน์ • ๋‹จ์–ด ์ œ์™ธ
});

이렇게 하면 필요한 불용어만 걸러내고 중요한 키워드는 보존할 수 있습니다.


3. 조사/어미 스트리핑

예를 들어 "내용을"이라는 토큰을 받으면, stripJosa()가 "내용"만 남기고 조사 "을"을 제거합니다.
마찬가지로 "살펴봅시다"는 stripEnding()을 통해 "살펴보"까지 줄어듭니다.

// Returns capture group 1 of JOSA_PATTERN when the token matches it (the token
// minus its trailing particle); otherwise returns the token unchanged.
function stripJosa(token: string) {
  const matched = JOSA_PATTERN.exec(token);
  if (matched === null) return token;
  return matched[1];
}

5. 메인 파이프라인

최종적으로 텍스트 입력을 받아 토큰화 + 정규화 + 불용어 제거 + 조사/어미 스트리핑을 수행합니다.

// Pipeline: normalize -> split on spaces -> drop stopwords -> strip josa (and
// endings when enabled) -> keep stems longer than one character that are not
// stopwords themselves.
export function tokenizeAndFilterKo(input: string, opts?: StopwordOptions): string[] {
  const { set: stopwords, includeEndingsStrip } = buildKoreanStopwordSet(opts);

  return normalizeKo(input)
    .split(' ')
    // Stopword removal (empty fragments are discarded as well).
    .filter((token) => token !== '' && !stopwords.has(token))
    .map((token) => {
      const withoutJosa = stripJosa(token); // josa removal
      // Ending removal, only when the option is on.
      return includeEndingsStrip ? stripEnding(withoutJosa) : withoutJosa;
    })
    .filter((stem) => stem.length > 1 && !stopwords.has(stem));
}

6. 활용 포인트

  • 한국어 자연어 처리(NLP) 전처리 단계에서 바로 사용 가능
  • 검색엔진 키워드 추출 / 토픽 분석 / TF-IDF / 워드클라우드 등 활용
  • 불용어 리스트를 도메인(뉴스, 블로그, 리뷰…)에 맞게 확장/축소 가능

7. 주의사항

  • 완벽한 형태소 분석기가 아니므로, 일부 복잡한 조사·어미는 제거가 불완전할 수 있음
  • "중요한" → "중요"와 같이 어근 복원(stemming)이 필요한 경우는 추가 처리가 필요
  • 한 글자 단어 중 "것" 등을 보존할지 여부는 상황에 맞게 조정 필요

✍️ 결론:
이 유틸은 가볍게 적용할 수 있는 불용어 필터 & 조사/어미 정리기입니다.
품사 태깅이나 형태소 분석기가 부담스럽거나 과한 경우, 빠르게 전처리 단계에서 적용할 수 있는 실용적인 도구라고 보시면 됩니다.

문서를 수정하려면 로그인이 필요합니다.