opensearch_langchain_hybridsearch_citations

https://github.com/jpdas/opensearch_langchain_hybridsearch_citations

Science Score: 31.0%

This score indicates how likely this project is to be science-related based on various indicators:

✓
CITATION.cff file
Found CITATION.cff file
✓
codemeta.json file
Found codemeta.json file
○
.zenodo.json file
○
DOI references
○
Academic publication links
○
Academic email domains
○
Institutional organization owner
○
JOSS paper metadata
○
Scientific vocabulary similarity
Unable to calculate vocabulary similarity

Last synced: 10 months ago · JSON representation ·

Repository

Basic Info

Host: GitHub
Owner: JPDas
Language: Python
Default Branch: main
Size: 0 Bytes

Statistics

Stars: 0
Watchers: 1
Forks: 0
Open Issues: 0
Releases: 0

Created over 1 year ago · Last pushed over 1 year ago

Metadata Files

Readme Citation

README.md

OPENSEARCH_LANGCHAIN_HYBRIDSEARCH_CITATIONS

Owner

Name: Jyotiprakash Das
Login: JPDas
Kind: user
Location: Bangalore
Company: Motorola Solutions

Repositories: 1
Profile: https://github.com/JPDas

Citation (citation.py)

import sys
import os
import hashlib
import re


from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from typing import List
from loguru import logger
from dotenv import load_dotenv

# Load environment variables via python-dotenv first, so that LOG_LEVEL
# (read just below) can be supplied that way.
load_dotenv()
# logger: call loguru's remove() with no handler id, then register stdout as
# a sink at the level named by the LOG_LEVEL environment variable
# (falls back to "INFO" when the variable is unset).
logger.remove()
logger.add(sys.stdout, level=os.getenv("LOG_LEVEL", "INFO"))


def create_opensearch_vector_search_client(index_name, user_name, opensearch_password, embeddings_client, opensearch_endpoint, _is_aoss=False):
    """Build an ``OpenSearchVectorSearch`` store for the given index.

    Args:
        index_name: Value forwarded as ``index_name``.
        user_name: First element of the ``http_auth`` tuple.
        opensearch_password: Second element of the ``http_auth`` tuple.
        embeddings_client: Object forwarded as ``embedding_function``.
        opensearch_endpoint: URL forwarded as ``opensearch_url``.
        _is_aoss: Forwarded as ``is_aoss``; defaults to ``False``.

    Returns:
        The constructed ``OpenSearchVectorSearch`` instance.
    """
    credentials = (user_name, opensearch_password)
    return OpenSearchVectorSearch(
        opensearch_url=opensearch_endpoint,
        http_auth=credentials,
        index_name=index_name,
        embedding_function=embeddings_client,
        is_aoss=_is_aoss,
    )


def generate_anchor_name(text):
    """Return a short anchor name derived from the SHA-256 digest of *text*.

    The anchor is the literal prefix ``quote-`` followed by the first eight
    hexadecimal characters of the digest, so equal text always yields the
    same anchor.
    """
    digest_hex = hashlib.sha256(text.encode()).hexdigest()
    short_digest = digest_hex[:8]  # eight hex characters keep the anchor brief
    return "quote-{}".format(short_digest)

def generate_citation(source_id, quote, base_url):
    """Return a ``<citation>`` element linking a source to an anchored URL.

    Args:
        source_id: Identifier rendered inside square brackets in ``<source_id>``.
        quote: Quoted text; its anchor name comes from ``generate_anchor_name``.
        base_url: URL that the ``#anchor`` fragment is appended to.

    Returns:
        A string of the form
        ``<citation><source_id>[ID]</source_id><link>(URL#anchor)</link></citation>``.
    """
    anchor = generate_anchor_name(quote)
    source_tag = f"<source_id>[{source_id}]</source_id>"
    link_tag = f"<link>({base_url}#{anchor})</link>"
    return "<citation>" + source_tag + link_tag + "</citation>"


# def update_citation_with_streaming(llm_stream, link_base):
#     """Processes an LLM streaming response and adds citations."""
#     full_response = ""
#     citations = []
#     in_citation_block = False  # Track if we are inside a citation block

#     for chunk in llm_stream:
#         text_chunk = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
#         full_response += text_chunk

#         # Regular expression to find potential quotes (improve as needed)
#         quote_matches = re.findall(r'"([^"]*)"', full_response) #finds the string between two double quotes

#         for quote in quote_matches:
#             # Check if this quote has already been cited
#             if not any(quote in cit["quote"] for cit in citations):

#                 # Simulate retrieving source ID (replace with your actual retrieval)
#                 source_id = f"Source-{len(citations) + 1}"  # Replace with actual source ID retrieval

#                 citation = {
#                     "source_id": source_id,
#                     "quote": quote,
#                     "formatted_citation": generate_citation(source_id, quote, link_base),
#                 }
#                 citations.append(citation)

#                 # Remove the quote from the response so it's not repeated.
#                 full_response = full_response.replace(f'"{quote}"', "")

#         # Check if we are at the end of the LLM stream
#         if chunk.get("choices", [{}])[0].get("finish_reason"):
#             cited_answer = "<cited_answer>\n"
#             cited_answer += f"    <answer>{full_response.strip()}</answer>\n"
#             cited_answer += "    <citations>\n"
#             for cit in citations:
#                 cited_answer += f"        {cit['formatted_citation']}\n"
#             cited_answer += "    </citations>\n"
#             cited_answer += "</cited_answer>"
#             yield cited_answer
#             return

#         yield text_chunk  # Yield the current chunk of the response


def update_citation_without_streaming(response, link_base):
    """Rewrite the ``<citations>`` block of an LLM answer into anchor links.

    The answer text is read from ``response['answer']``. The first
    ``<citations>...</citations>`` span in it is located, every ``<citation>``
    inside that span is parsed for its ``<source_id>`` and ``<quote>``, and
    the span is replaced by a new ``<citations>`` block whose entries are
    produced by ``generate_citation``.

    Args:
        response: Mapping holding the LLM text under the ``'answer'`` key.
        link_base: Base URL that each citation anchor is appended to.

    Returns:
        The answer text with its citations block rewritten. The text is
        returned unchanged when it contains no ``<citations>`` block, and an
        empty string is returned when there is no answer at all. Citations
        that lack a ``<source_id>`` or a ``<quote>`` are skipped with a
        warning instead of aborting the whole rewrite.
    """
    # A missing/None answer would make re.search raise TypeError.
    llm_response = response.get('answer') or ""
    logger.info(f"llm response: {llm_response}")

    citations_block_match = re.search(r"<citations>(.*?)</citations>", llm_response, re.DOTALL)
    if citations_block_match is None:
        # The model may answer without citations (e.g. "I don't know");
        # there is nothing to rewrite in that case.
        logger.warning("No <citations> block found in llm response; returning it unchanged")
        return llm_response

    citations_block = citations_block_match.group(0)
    logger.info(f"Cited answer : {citations_block}")

    citation_matches = re.findall(r"<citation>(.*?)</citation>", citations_block, re.DOTALL)
    logger.info(f"citation_matches : {citation_matches}")

    formatted_citations = []
    for citation_content in citation_matches:
        # re.DOTALL lets a source id or quote span several lines; without it a
        # multi-line quote would not match at all.
        source_id_match = re.search(r"<source_id>(.*?)</source_id>", citation_content, re.DOTALL)
        quote_match = re.search(r"<quote>(.*?)</quote>", citation_content, re.DOTALL)

        if source_id_match is None or quote_match is None:
            # Without a quote no anchor can be derived, and without a source
            # id the citation would render as "[None]".
            logger.warning(f"Skipping malformed citation: {citation_content}")
            continue

        formatted_citations.append(
            generate_citation(source_id_match.group(1), quote_match.group(1), link_base)
        )

    new_citations_string = "<citations>" + "".join(formatted_citations) + "</citations>"

    # Splice by position so exactly the matched block is replaced.
    block_start, block_end = citations_block_match.span()
    return llm_response[:block_start] + new_citations_string + llm_response[block_end:]

if __name__ == "__main__":
    # Demo entry point: answer one question with a retrieval chain backed by
    # OpenSearch and log the raw LLM answer.

    # Connection settings are read from the environment (load_dotenv() has
    # already run at import time) so credentials need not live in the source;
    # the defaults are the original local-development values.
    index_name = os.getenv("OPENSEARCH_INDEX", "my-test-index")
    host = os.getenv("OPENSEARCH_HOST", "localhost")
    port = int(os.getenv("OPENSEARCH_PORT", "9200"))
    user_name = os.getenv("OPENSEARCH_USER", "admin")
    opensearch_password = os.getenv("OPENSEARCH_PASSWORD", "")
    opensearch_endpoint = f"http://{host}:{port}"

    link_base = "https://example.com/article.html"

    llm = ChatOpenAI(temperature=0.0, model="gpt-4o-mini")

    embedding_client = OpenAIEmbeddings(model="text-embedding-ada-002")

    opensearch_vector_search_client = create_opensearch_vector_search_client(index_name, user_name, opensearch_password, embedding_client, opensearch_endpoint)

    # The prompt asks for an answer plus per-quote citations in the XML-like
    # layout that update_citation_without_streaming parses.
    prompt = ChatPromptTemplate.from_template("""
        If the context is not relevant, please answer the question by using your own knowledge about the topic. 
        If you don't know the answer, just say that you don't know, don't try to make up an answer. Don't include harmful content.

        Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that \
        justifies the answer and the ID of the quote article. Return a citation for every quote across all articles \
        that justify the answer. Use the following format for your final output:

        <cited_answer>
            <answer></answer>
            <citations>
                <h4>Sources Cited:</h4>
                <citation><source_id>[source_id]</source_id><quote><u>quote</u></quote></citation>
                <citation><source_id>[source_id]</source_id><quote><u>quote</u></quote></citation>
                ...
            </citations>
        </cited_answer>

        {context}
        Question: {input}
        Answer:""")

    docs_chain = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(
        retriever=opensearch_vector_search_client.as_retriever(
            search_type="similarity",
            search_kwargs={'k': 5},
        ),
        combine_docs_chain=docs_chain,
    )

    question = "What is risk culture and adoption?"
    logger.info("Invoking the chain with KNN similarity using OpenSearch")
    response = retrieval_chain.invoke({"input": question})

    answer = response.get('answer')
    logger.info(f"The answer from llm is: {answer}")

    # Citation post-processing is left disabled, exactly as in the original script.
    # response = update_citation_without_streaming(response, link_base)

    # logger.info(f"Final response: {response}")

GitHub Events

Total

Push event: 1
Create event: 1

Last Year

Push event: 1
Create event: 1

Dependencies

requirements.txt pypi

langchain *
langchain-community *
langchain-core *
langchain_openai *
loguru *
opensearch-py *
pandas *
pdfplumber *

ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Open Source Science