Enhancing Retrieval-Augmented Generation (RAG) for Thai Content with Semantic Chunking
In my previous post, "Experimenting with Retrieval-Augmented Generation (RAG) for Thai Content", I noticed that the content stored in the vector database was chunked by the SentenceTokenSplitter, which produced many small pieces. Each piece carried very limited meaning, and most of the time the search results from the vector database were not thorough. Consequently, the responses from the RAG-LLM were not as expected.
To address this, I researched different chunking strategies and found this insightful article on semantic chunking. After reading it, I felt that semantic chunking made more sense: it groups semantically close sentences into a single chunk instead of producing many single-sentence pieces. In addition, the output of the SentenceTokenSplitter did not preserve the original text faithfully, often losing important white space. I therefore decided to replace the SentenceTokenSplitter with sent_tokenize from pythainlp, combined with the semantic chunking method explained in the article.
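To illustrate the first piece of that change, here is a minimal sketch of how sent_tokenize breaks a Thai passage into sentence-like units (the sample text and the printed output are illustrative only; actual segmentation depends on the engine pythainlp uses):

from pythainlp import sent_tokenize

# Split a short Thai passage; sent_tokenize returns a list of sentence strings.
text = "วันนี้อากาศดีมาก ผมจึงออกไปวิ่งที่สวนสาธารณะ หลังจากนั้นก็แวะดื่มกาแฟ"
print(sent_tokenize(text))
# e.g. ['วันนี้อากาศดีมาก ', 'ผมจึงออกไปวิ่งที่สวนสาธารณะ ', 'หลังจากนั้นก็แวะดื่มกาแฟ']

These sentence units are the input to the semantic chunking step in the code below, which merges neighbouring sentences back together whenever their embeddings stay close.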
Implementation Changes
After updating the code, my local LM Studio Inference Server could no longer handle the larger chunks returned from the vector database due to resource limitations. Consequently, I switched to the Google GenAI platform, including Google’s embedding function. Below is the updated code:
from langchain_community.document_loaders import PyMuPDFLoader
from pythainlp import sent_tokenize
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import numpy as np
def combine_sentences(sentences, buffer_size=1):
    # Attach the surrounding sentences (buffer_size before and after) to each
    # sentence so its embedding captures a little local context.
    for i in range(len(sentences)):
        combined_sentence = ''
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        combined_sentence += sentences[i]['sentence']
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences
def calculate_cosine_distances(sentences):
    # Cosine distance between consecutive combined-sentence embeddings;
    # a large distance marks a potential semantic breakpoint.
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences
def gradio_vectorize_and_store(pdf_file):
    # gr.File(type="filepath") passes the uploaded file's path as a string;
    # fall back to .name in case an older Gradio version hands over a file object.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()

    # Join all pages, then split the text into Thai sentences with pythainlp.
    long_sent = " ".join([doc.page_content for doc in docs])
    short_sent = sent_tokenize(long_sent)
    sentences = [{'sentence': sent, 'index': i} for i, sent in enumerate(short_sent)]
    sentences = combine_sentences(sentences)

    # Embed the combined sentences in batches with Google's embedding model.
    ggaiembeds = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    batch_size = 100
    batched_sentences = [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]
    all_embeddings = []
    for batch in batched_sentences:
        embeddings = ggaiembeds.embed_documents([x['combined_sentence'] for x in batch])
        all_embeddings.extend(embeddings)
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = all_embeddings[i]

    # Find breakpoints where the distance to the next sentence exceeds the
    # 95th percentile, then group the sentences between breakpoints into chunks.
    distances, sentences = calculate_cosine_distances(sentences)
    start_index = 0
    chunks = []
    breakpoint_percentile_threshold = 95
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
    for index in indices_above_thresh:
        end_index = index
        group = sentences[start_index:end_index + 1]
        combined_text = ' '.join([d['sentence'] for d in group])
        chunks.append(combined_text)
        start_index = index + 1
    if start_index < len(sentences):
        combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
        chunks.append(combined_text)

    # Store the chunks in Qdrant, recreating the collection on each run.
    url = "http://localhost:6333"
    qdrant = Qdrant.from_texts(
        chunks,
        ggaiembeds,
        url=url,
        prefer_grpc=True,
        collection_name="my_documents",
        force_recreate=True,
    )
    return "Processed and stored the vector database successfully!"
def gradio_query(query, history):
    # Reconnect to the existing Qdrant collection with the same embedding model.
    url = "http://localhost:6333"
    client = QdrantClient(url)
    ggaiembeds = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    collection_name = "my_documents"
    qdrant = Qdrant(client, collection_name, ggaiembeds)
    retriever = qdrant.as_retriever(search_type="mmr", search_kwargs=dict(k=4, fetch_k=20))

    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

    # Rewrite the latest question into a standalone one using the chat history.
    contextualize_q_system_prompt = """Reformulate the latest user question into a standalone question that does not require previous context. Do NOT answer the question; only rephrase it if needed."""
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # Answer from the retrieved context only, keeping responses short.
    qa_system_prompt = """You are an assistant for question-answering tasks. Use the provided context to answer the question. If you don't know the answer, say so. Keep the answer concise, using a maximum of three sentences.
{context}"""
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    # Convert the Gradio chat history into LangChain message objects.
    chat_history = []
    for (question, answer) in history:
        chat_history.extend([HumanMessage(content=question), AIMessage(content=answer)])

    ai_msg = rag_chain.invoke({"input": query, "chat_history": chat_history})
    history.append((query, ai_msg["answer"]))
    return "", history
with gr.Blocks() as app:
    with gr.Tab("Vectorize and Store"):
        gr.Markdown("## Upload PDF File to Vectorize and Store")
        pdf_file_input = gr.File(label="Upload PDF File", type="filepath")
        vectorize_button = gr.Button("Vectorize and Store")
        vectorize_output = gr.Textbox(label="Status")
        vectorize_button.click(gradio_vectorize_and_store, inputs=pdf_file_input, outputs=vectorize_output)
    with gr.Tab("Chat with Me"):
        gr.Markdown("## Message to Chat with Me")
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        clear = gr.ClearButton([msg, chatbot])
        msg.submit(gradio_query, [msg, chatbot], [msg, chatbot])

app.launch()
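One setup note: ChatGoogleGenerativeAI and GoogleGenerativeAIEmbeddings read their credentials from the GOOGLE_API_KEY environment variable by default, so the key has to be available before launching the app. A minimal sketch, with a placeholder value:

import os

# Placeholder only; in practice, export GOOGLE_API_KEY in the shell rather than hard-coding it.
os.environ.setdefault("GOOGLE_API_KEY", "your-google-api-key")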
Results and Observations
After running the updated code, I observed a significant improvement in performance, since the LLM now runs on the Google platform instead of my local machine. The relevance of the responses has also improved, thanks to the semantic chunking method.