๋ฉ”์ธ ์ฝ˜ํ…์ธ ๋กœ ๊ฑด๋„ˆ๋›ฐ๊ธฐ

Sonamu์—์„œ ๊ธด ๋ฌธ์„œ ์ฒ˜๋ฆฌํ•˜๊ธฐ

Sonamu ์•ฑ์— ๋ธ”๋กœ๊ทธ ๊ธ€์ด๋‚˜ ๋งค๋‰ด์–ผ์„ ์—…๋กœ๋“œํ•˜๋Š” ๊ธฐ๋Šฅ์„ ๋งŒ๋“ค๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค:
/**
 * Naive upload handler that embeds the whole uploaded file in one call.
 * Shown as the failing case: the trailing comment records the token-limit
 * error this produces.
 */
class DocumentModelClass extends BaseModel {
  @upload({ mode: 'single' })
  @api({ httpMethod: 'POST' })
  async uploadDocument() {
    const { file } = Sonamu.getUploadContext();
    const buffer = await file.toBuffer();
    const content = buffer.toString();

    // Attempt to generate a single embedding for the entire document
    const embedding = await Embedding.embedOne(content, 'voyage', 'document');
    // ❌ Error: token limit exceeded (32,000 tokens)
  }
}
문제:
  • 긴 문서 (10,000단어 이상)
  • 임베딩 API 토큰 제한 초과
  • 전체를 한 번에 임베딩 불가능
해결책: 청킹(Chunking) - 문서를 작은 조각으로 나누기

청킹이란?

핵심:
  • 긴 문서 → 여러 청크
  • 각 청크 → 개별 임베딩
  • 검색 시 → 가장 관련 있는 청크 반환

왜 필요한가?

1. 토큰 제한
  • Voyage AI: 32,000 토큰
  • OpenAI: 8,191 토큰
  • 긴 문서는 제한 초과
2. 검색 정확도
  • 짧은 청크가 더 정확한 결과
  • “환불 방법” 검색 시 → 환불 섹션만 반환
3. 컨텍스트 보존
  • 관련 정보를 함께 유지
  • 문장이 끊기지 않게 분할

Sonamu์˜ Chunking ํด๋ž˜์Šค

import { Chunking } from "sonamu/vector";

// Chunker configured with every option spelled out for illustration.
const chunking = new Chunking({
  chunkSize: 500,        // chunk size (characters)
  chunkOverlap: 50,      // overlap size
  minChunkSize: 50,      // minimum size
  skipThreshold: 200,    // skip splitting when the text is short
  separators: ['\n\n', '\n', '. '],  // separators
});

// Sample input; the Korean literal reads "long text...".
const chunks = chunking.chunk("긴 텍스트...");

Sonamu Model์—์„œ ์‚ฌ์šฉํ•˜๊ธฐ

๊ธด ๋ฌธ์„œ ์—…๋กœ๋“œ + ์ฒญํ‚น

/**
 * Document model that stores a long upload as one parent row plus one
 * embedded row per chunk.
 */
class DocumentModelClass extends BaseModel {
  /**
   * Splits the uploaded file into chunks, embeds every chunk, then saves the
   * parent document followed by its chunk rows.
   *
   * @returns The parent document id and the number of chunks stored.
   * @throws Error when the embedding call returns a different number of
   *   results than the number of chunks sent.
   */
  @upload({ mode: 'single' })
  @api({ httpMethod: 'POST' })
  async uploadLongDocument() {
    const { file } = Sonamu.getUploadContext();
    const buffer = await file.toBuffer();
    const content = buffer.toString();

    // 1. Chunking
    const chunking = new Chunking({
      chunkSize: 500,
      chunkOverlap: 50,
    });

    const chunks = chunking.chunk(content);

    // 2. Embed each chunk
    const embeddings = await Embedding.embed(
      chunks.map(c => c.text),
      'voyage',
      'document'
    );

    // Chunks and embeddings are paired by position below, so confirm the
    // counts line up before any row is written; otherwise a short response
    // would leave a parent document behind with missing chunk rows.
    if (embeddings.length !== chunks.length) {
      throw new Error(
        `Embedding count mismatch: expected ${chunks.length}, received ${embeddings.length}`
      );
    }

    // 3. Create the parent document
    const parent = await this.saveOne({
      title: file.filename,
      content,
      chunk_count: chunks.length,
    });

    // 4. Save each chunk (the saved rows are not needed afterwards)
    await Promise.all(
      chunks.map((chunk, i) =>
        DocumentChunkModel.saveOne({
          parent_id: parent.id,
          chunk_index: chunk.index,
          content: chunk.text,
          start_offset: chunk.startOffset,
          end_offset: chunk.endOffset,
          embedding: embeddings[i].embedding,
        })
      )
    );

    return {
      parentId: parent.id,
      chunkCount: chunks.length,
    };
  }
}

ํ…Œ์ด๋ธ” ๊ตฌ์กฐ

-- Parent documents
CREATE TABLE documents (
  id SERIAL PRIMARY KEY,
  title TEXT NOT NULL,
  content TEXT NOT NULL,
  chunk_count INTEGER,
  created_at TIMESTAMP DEFAULT NOW()
);

-- Chunks (store the embeddings)
CREATE TABLE document_chunks (
  id SERIAL PRIMARY KEY,
  parent_id INTEGER REFERENCES documents(id),
  chunk_index INTEGER,
  content TEXT NOT NULL,
  start_offset INTEGER,
  end_offset INTEGER,
  embedding vector(1024),
  created_at TIMESTAMP DEFAULT NOW()
);

-- Index for looking up chunks by their parent document
CREATE INDEX ON document_chunks (parent_id);
-- HNSW index on the embedding column using the cosine operator class
CREATE INDEX ON document_chunks USING hnsw (embedding vector_cosine_ops);

설정 옵션 이해하기

chunkSize: 청크 크기

// Chunker with only the chunk size set.
const chunking = new Chunking({
  chunkSize: 500,  // 500 characters
});
권장 값:
  • 짧은 검색: 200-300자
  • 일반적: 400-600자
  • 긴 컨텍스트: 800-1000자
고려 사항:
  • 한국어: ~1자 = ~1토큰
  • 영어: ~1자 = ~0.7토큰

chunkOverlap: 중복 크기

// Chunker with an overlap between consecutive chunks.
const chunking = new Chunking({
  chunkSize: 500,
  chunkOverlap: 50,  // 10% of chunkSize
});
์—ญํ• : ์ฒญํฌ ๊ฒฝ๊ณ„์—์„œ ๋ฌธ๋งฅ ์œ ์ง€
์ฒญํฌ 1: [..............................]
์ฒญํฌ 2:                    [...........]
                           โ†‘ ์ค‘๋ณต ์˜์—ญ
๊ถŒ์žฅ: chunkSize์˜ 10-20%

skipThreshold: ๋ถ„ํ•  ์Šคํ‚ต

// Chunker that leaves short texts unsplit.
const chunking = new Chunking({
  skipThreshold: 200,
});

// Text of 200 characters or fewer is not split
const text = "짧은 글";  // "short text": illustrative stand-in for a ~50-character input
const chunks = chunking.chunk(text);  // [the whole text as a single chunk]
효율: 짧은 문서는 청킹 불필요

separators: 구분자 우선순위

// Separators listed in priority order, largest unit first.
const chunking = new Chunking({
  separators: [
    '\n\n',  // priority 1: paragraph
    '\n',    // priority 2: line
    '. ',    // priority 3: sentence
    ', ',    // priority 4: comma
  ],
});
๋™์ž‘: ์™ผ์ชฝ๋ถ€ํ„ฐ ์‹œ๋„

์‹ค์ „ ์‹œ๋‚˜๋ฆฌ์˜ค

์‹œ๋‚˜๋ฆฌ์˜ค: ๊ธฐ์ˆ  ๋ฌธ์„œ ์ง€์‹ ๋ฒ ์ด์Šค

Sonamu๋กœ ๊ฐœ๋ฐœ ๋ฌธ์„œ ๊ฒ€์ƒ‰ ์‹œ์Šคํ…œ์„ ๋งŒ๋“ค๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. 1๋‹จ๊ณ„: ์กฐ๊ฑด๋ถ€ ์ฒญํ‚น
/**
 * Upload endpoint that sends short files down the single-embedding path and
 * longer files down the chunked path.
 */
@upload({ mode: 'single' })
@api({ httpMethod: 'POST' })
async uploadTechDoc() {
  const { file } = Sonamu.getUploadContext();
  const buffer = await file.toBuffer();
  const content = buffer.toString();

  const chunking = new Chunking({ chunkSize: 500, skipThreshold: 300 });

  // Short content is stored as-is; anything longer goes through chunking
  if (!chunking.needsChunking(content)) {
    return await this.uploadSimple(file.filename, content);
  }
  return await this.uploadWithChunking(file.filename, content);
}

/**
 * Embeds the title and content together as one text and saves them as a
 * single row.
 *
 * @param title - Document title; also prepended to the text that is embedded.
 * @param content - Full document text.
 * @returns Whatever `saveOne` resolves to.
 */
private async uploadSimple(title: string, content: string) {
  const embeddingInput = [title, content].join('\n\n');
  const { embedding: vector } = await Embedding.embedOne(
    embeddingInput,
    'voyage',
    'document'
  );

  return await this.saveOne({ title, content, embedding: vector });
}

/**
 * Chunked upload path. The remainder of the body is elided in this example.
 *
 * @param title - Document title.
 * @param content - Full document text to split into chunks.
 */
private async uploadWithChunking(title: string, content: string) {
  // Same as the example above
  const chunking = new Chunking({ chunkSize: 500 });
  const chunks = chunking.chunk(content);
  // ...
}
2단계: 검색 (청크 기반)
/**
 * Searches chunks by vector similarity and returns the hits grouped by
 * parent document.
 *
 * @param query - Free-text search query to embed.
 * @param limit - Maximum number of parent documents to return (default 5).
 *   Fractions are truncated; non-finite values and values below 1 yield an
 *   empty result instead of reaching the database.
 * @returns One entry per parent document, best match first, each carrying
 *   the chunks that matched.
 */
@api({ httpMethod: 'POST' })
async searchDocs(query: string, limit: number = 5) {
  // `limit` is caller-supplied and ends up in the SQL LIMIT clause, so
  // normalise it before doing any work.
  const maxDocs = Number.isFinite(limit) ? Math.floor(limit) : 0;
  if (maxDocs < 1) {
    return [];
  }

  const embedding = await Embedding.embedOne(query, 'voyage', 'query');
  const queryVector = JSON.stringify(embedding.embedding);

  // Chunk search: fetches twice the requested count, since several chunks
  // can collapse into one parent document after grouping.
  const chunks = await this.getPuri().raw(`
    SELECT 
      c.id, c.parent_id, c.content, c.chunk_index,
      d.title,
      1 - (c.embedding <=> ?) AS similarity
    FROM document_chunks c
    JOIN documents d ON c.parent_id = d.id
    WHERE c.embedding IS NOT NULL
    ORDER BY c.embedding <=> ?
    LIMIT ?
  `, [
    queryVector,
    queryVector,
    maxDocs * 2,
  ]);

  type ChunkHit = { content: string; chunkIndex: number; similarity: number };
  type DocumentHit = {
    parentId: number;
    title: string;
    bestSimilarity: number;
    relevantChunks: ChunkHit[];
  };

  // Group by parent document
  const grouped = new Map<number, DocumentHit>();

  for (const chunk of chunks.rows) {
    let group = grouped.get(chunk.parent_id);
    if (group === undefined) {
      // Rows arrive ordered by distance, so the first chunk seen for a
      // parent is that parent's best match.
      group = {
        parentId: chunk.parent_id,
        title: chunk.title,
        bestSimilarity: chunk.similarity,
        relevantChunks: [],
      };
      grouped.set(chunk.parent_id, group);
    }

    group.relevantChunks.push({
      content: chunk.content,
      chunkIndex: chunk.chunk_index,
      similarity: chunk.similarity,
    });
  }

  return Array.from(grouped.values())
    .sort((a, b) => b.bestSimilarity - a.bestSimilarity)
    .slice(0, maxDocs);
}
์‘๋‹ต ์˜ˆ์‹œ:
[
  {
    "parentId": 123,
    "title": "TypeScript ์‹œ์ž‘ํ•˜๊ธฐ",
    "bestSimilarity": 0.89,
    "relevantChunks": [
      {
        "content": "TypeScript๋Š” ํƒ€์ž…์ด ์žˆ๋Š” JavaScript์ž…๋‹ˆ๋‹ค...",
        "chunkIndex": 2,
        "similarity": 0.89
      }
    ]
  }
]

๋งˆํฌ๋‹ค์šด ๋ฌธ์„œ์— ์ตœ์ ํ™”

// Chunker tuned for markdown: heading separators come before paragraph,
// line and sentence separators.
const markdownChunking = new Chunking({
  chunkSize: 600,
  separators: [
    '\n## ',   // heading level 2
    '\n### ',  // heading level 3
    '\n\n',    // paragraph
    '\n',      // line
    '. ',      // sentence
  ],
});

const markdown = `
# Sonamu

## ๊ฐœ์š”
Sonamu๋Š” TypeScript ํ”„๋ ˆ์ž„์›Œํฌ์ž…๋‹ˆ๋‹ค.

## ์„ค์น˜
\`\`\`bash
pnpm add sonamu
\`\`\`
`;

// Chunk the sample markdown with the heading-aware chunker configured above
const chunks = markdownChunking.chunk(markdown);
ํšจ๊ณผ: ํ—ค๋”ฉ ๋‹จ์œ„๋กœ ๋ถ„ํ•  โ†’ ๋ฌธ๋งฅ ๋ณด์กด

์ฒญํ‚น vs ์ „์ฒด ๋ฌธ์„œ

์–ธ์ œ ์ฒญํ‚น์ด ํ•„์š”ํ•œ๊ฐ€?

๋ฌธ์„œ ํƒ€์ž…ํ‰๊ท  ๊ธธ์ด์ฒญํ‚น ํ•„์š”์ด์œ 
FAQ ํ•ญ๋ชฉ< 200์žโŒ์งง์Œ
๋ธ”๋กœ๊ทธ ๊ธ€1,000์ž์„ ํƒ์ค‘๊ฐ„
๊ธฐ์ˆ  ๋ฌธ์„œ5,000์žโœ…๊น€
๋งค๋‰ด์–ผ20,000์žโœ… ํ•„์ˆ˜๋งค์šฐ ๊น€
์ฑ„ํŒ… ๋ฉ”์‹œ์ง€< 100์žโŒ์งง์Œ

Sonamu์—์„œ์˜ ํŒ๋‹จ

// Let the chunker decide whether the content needs splitting.
const chunking = new Chunking({
  skipThreshold: 300,  // 300 characters or fewer are skipped
});

if (chunking.needsChunking(content)) {
  // Chunked processing
} else {
  // Process as-is
}

์ฃผ์˜์‚ฌํ•ญ

Sonamu์—์„œ ์ฒญํ‚น ์‚ฌ์šฉ ์‹œ ์ฃผ์˜์‚ฌํ•ญ:
  1. chunkSize ๋„ˆ๋ฌด ์ž‘์ง€ ์•Š๊ฒŒ
    // โŒ ๋„ˆ๋ฌด ์ž‘์Œ
    chunkSize: 50
    
    // โœ… ์ ์ ˆ
    chunkSize: 400-600
    
  2. chunkOverlap ์ ์ ˆํžˆ
    // ๊ถŒ์žฅ: 10-20%
    chunkSize: 500,
    chunkOverlap: 50,
    
  3. separators ์ˆœ์„œ ์ค‘์š”
    // โœ… ํฐ ๋‹จ์œ„๋ถ€ํ„ฐ
    separators: ['\n\n', '\n', '. ']
    
    // โŒ ์ž‘์€ ๋‹จ์œ„๋ถ€ํ„ฐ
    separators: [' ', '.', '\n']
    
  4. ๋ถ€๋ชจ-์ž์‹ ๊ด€๊ณ„ ์œ ์ง€
    -- parent_id๋กœ ์—ฐ๊ฒฐ
    CREATE TABLE document_chunks (
      parent_id INTEGER REFERENCES documents(id)
    );
    
  5. ๊ฒ€์ƒ‰ ์‹œ ์ค‘๋ณต ์ œ๊ฑฐ
    // ๊ฐ™์€ ๋ฌธ์„œ์˜ ์—ฌ๋Ÿฌ ์ฒญํฌ โ†’ ํ•˜๋‚˜๋กœ ๊ทธ๋ฃนํ™”
    const grouped = new Map();
    
  6. offset ์ €์žฅ (์„ ํƒ)
    // ์›๋ณธ ์œ„์น˜ ์ถ”์ 
    start_offset: chunk.startOffset,
    end_offset: chunk.endOffset,
    

๋‹ค์Œ ๋‹จ๊ณ„