indian-legal-document-analyzer
A RAG-based question-answering system for legal documents
https://github.com/shubham00-3/indian-legal-document-analyzer
Science Score: 44.0%
This score indicates how likely this project is to be science-related based on various indicators:
-
✓CITATION.cff file
Found CITATION.cff file -
✓codemeta.json file
Found codemeta.json file -
✓.zenodo.json file
Found .zenodo.json file -
○DOI references
-
○Academic links in README
-
○Academic email domains
-
○Institutional organization owner
-
○JOSS paper metadata
-
○Scientific vocabulary similarity
Low similarity (0.4%) to scientific vocabulary
Last synced: 6 months ago
·
JSON representation
·
Repository
A RAG-based question-answering system for legal documents
Basic Info
- Host: GitHub
- Owner: Shubham00-3
- Language: Python
- Default Branch: master
- Homepage: https://indian-legal-document-analyzer.vercel.app
- Size: 0 Bytes
Statistics
- Stars: 1
- Watchers: 1
- Forks: 0
- Open Issues: 0
- Releases: 0
Created 10 months ago
· Last pushed 10 months ago
Metadata Files
Readme
Citation
Owner
- Name: Shubham Gangwar
- Login: Shubham00-3
- Kind: user
- Repositories: 1
- Profile: https://github.com/Shubham00-3
Citation (citation_analyzer.py)
# citation_analyzer.py
# Analyzes and extracts citation networks from legal documents
import re
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import io
import base64
class CitationAnalyzer:
    """Extract citations from Indian legal documents and build citation networks.

    Recognized citation families (the keys of ``self.patterns``):
      - case_citations:         SCC / AIR SC / SCR reporter formats
      - statute_citations:      Acts, Rules and Regulations carrying a year
      - constitution_citations: Article references
      - section_citations:      Section references (optionally with the Act name)
    """

    def __init__(self):
        """Initialize the citation analyzer with one regex list per citation type."""
        # Regular expressions for detecting various citation types.
        # NOTE(review): patterns are applied via re.findall without re.IGNORECASE,
        # so casing must match exactly what is written here.
        self.patterns = {
            "case_citations": [
                r'\(\d{4}\)\s+\d+\s+SCC\s+\d+',  # SCC format
                r'\d{4}\s+\(\d+\)\s+SCC\s+\d+',  # Alternative SCC format
                r'AIR\s+\d{4}\s+SC\s+\d+',  # AIR SC format
                r'\(\d{4}\)\s+\d+\s+SCR\s+\d+',  # SCR format
            ],
            "statute_citations": [
                r'(?:The\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+Act,?\s+(?:of\s+)?\d{4})',  # Act format
                r'(?:The\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+Rules,?\s+(?:of\s+)?\d{4})',  # Rules format
                r'(?:The\s+)?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+Regulation,?\s+(?:of\s+)?\d{4})',  # Regulation format
            ],
            "constitution_citations": [
                r'[Aa]rticle\s+(\d+(?:\(\d+\))?(?:\([a-z]\))?)',  # Basic pattern
                r'[Aa]rticle\s+(\d+)(?:\((\d+)(?:\(([a-z])\))?)?\)',  # More specific pattern to capture structured references
                r'[Aa]rticle\s+(\d+(?:\(\d+\))?(?:\([a-z]\))?)\s+of\s+the\s+Constitution',
            ],
            "section_citations": [
                r'Section\s+(\d+(?:\([a-z]\))?)\s+of\s+the\s+([A-Za-z\s]+)',  # Section format
                r'under\s+[Ss]ection\s+(\d+(?:\([a-z]\))?)',  # Alternative section format
            ]
        }

    def _extract_raw(self, text):
        """Return every pattern hit as a (citation_type, citation) pair.

        Duplicates are preserved in match order. Both extract_citations
        (which de-duplicates) and generate_citation_report (which needs
        true occurrence counts) build on this single scan.
        """
        raw = []
        for citation_type, patterns in self.patterns.items():
            for pattern in patterns:
                for match in re.findall(pattern, text):
                    if isinstance(match, tuple):
                        # Multi-group patterns yield tuples; join the non-empty groups.
                        raw.append((citation_type, " ".join(m for m in match if m)))
                    else:
                        raw.append((citation_type, match))
        return raw

    def extract_citations(self, text):
        """
        Extract all citations from text
        Parameters:
        - text: Document text
        Returns:
        - Dictionary mapping each citation type to a de-duplicated,
          first-seen-order list of citation strings (all four type keys
          are always present, possibly with empty lists)
        """
        citations = {citation_type: [] for citation_type in self.patterns}
        for citation_type, citation in self._extract_raw(text):
            bucket = citations[citation_type]
            # Remove duplicates while preserving first-seen order.
            if citation not in bucket:
                bucket.append(citation)
        return citations

    def build_citation_network(self, documents):
        """
        Build a citation network from multiple documents
        Parameters:
        - documents: List of (doc_id, text, metadata) tuples
        Returns:
        - NetworkX DiGraph with document nodes (type='document'),
          citation nodes (type=<citation_type>), document->citation edges,
          and document->document edges for documents sharing a citation
        """
        G = nx.DiGraph()
        # Extract citations from each document.
        doc_citations = {}
        for doc_id, text, metadata in documents:
            # NOTE(review): documents sharing a filename collapse into one
            # node here — confirm filenames are unique upstream.
            doc_name = metadata.get('filename', doc_id)
            G.add_node(doc_name, type='document')
            citations = self.extract_citations(text)
            doc_citations[doc_name] = citations
            # Add citation nodes and document -> citation edges.
            for citation_type, matches in citations.items():
                for citation in matches:
                    citation_str = f"{citation_type}:{citation}"
                    if not G.has_node(citation_str):
                        G.add_node(citation_str, type=citation_type)
                    G.add_edge(doc_name, citation_str)
        # Invert to citation -> citing documents so shared sources are visible.
        citation_to_docs = defaultdict(list)
        for doc_name, citations in doc_citations.items():
            for citation_type, matches in citations.items():
                for citation in matches:
                    citation_str = f"{citation_type}:{citation}"
                    citation_to_docs[citation_str].append(doc_name)
        # Connect every pair of documents that cite the same source.
        for citation, citing_docs in citation_to_docs.items():
            if len(citing_docs) > 1:
                for i in range(len(citing_docs)):
                    for j in range(i + 1, len(citing_docs)):
                        G.add_edge(citing_docs[i], citing_docs[j], shared=citation)
        return G

    def plot_citation_network(self, graph):
        """
        Plot the citation network visualization as a base64 encoded image
        Parameters:
        - graph: NetworkX graph object
        Returns:
        - Base64 encoded PNG string for embedding in HTML
        """
        # Keep an explicit figure handle so we close exactly this figure,
        # not whatever happens to be "current" when we finish.
        fig = plt.figure(figsize=(12, 8))
        # Node colors based on type.
        color_by_type = {
            'document': 'skyblue',
            'case_citations': 'tomato',
            'statute_citations': 'lightgreen',
            'constitution_citations': 'gold',
            'section_citations': 'orchid',
        }
        colors = [
            color_by_type.get(graph.nodes[node].get('type', ''), 'gray')
            for node in graph.nodes()
        ]
        # Draw the network with a fixed seed so layouts are reproducible.
        pos = nx.spring_layout(graph, seed=42)
        nx.draw_networkx_nodes(graph, pos, node_size=700, node_color=colors, alpha=0.8)
        nx.draw_networkx_edges(graph, pos, edge_color='gray', alpha=0.5, arrows=True)
        # Abbreviate long node labels so the plot stays legible.
        labels = {}
        for node in graph.nodes():
            if len(node) > 30:
                labels[node] = node[:27] + "..."
            else:
                labels[node] = node
        nx.draw_networkx_labels(graph, pos, labels=labels, font_size=8)
        plt.axis('off')
        plt.tight_layout()
        # Render to an in-memory buffer and base64-encode for HTML embedding.
        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)
        img_str = base64.b64encode(buf.read()).decode('utf-8')
        return img_str

    def generate_citation_report(self, text):
        """
        Generate a report of citations in a document
        Parameters:
        - text: Document text
        Returns:
        - Dictionary with:
            citation_counts: number of unique citations per type
            total_citations: total number of unique citations
            top_citations: up to 5 most frequent ((type, citation), count)
                pairs, counted over raw occurrences
            all_citations: per-type unique citation lists
        """
        citations = self.extract_citations(text)
        # Count unique citations per type.
        citation_counts = {k: len(v) for k, v in citations.items()}
        total_citations = sum(citation_counts.values())
        # BUG FIX: frequency must be counted over the raw (pre-deduplication)
        # matches. The previous implementation counted the already-deduplicated
        # lists, so every citation had frequency 1 and the "top citations"
        # ranking carried no information.
        citation_frequency = defaultdict(int)
        for typed_citation in self._extract_raw(text):
            citation_frequency[typed_citation] += 1
        # Up to five most frequently cited references.
        top_citations = sorted(citation_frequency.items(), key=lambda x: x[1], reverse=True)[:5]
        return {
            "citation_counts": citation_counts,
            "total_citations": total_citations,
            "top_citations": top_citations,
            "all_citations": dict(citations),  # Convert to regular dict for JSON serialization
        }
GitHub Events
Total
- Issues event: 1
- Watch event: 3
- Push event: 11
- Create event: 5
Last Year
- Issues event: 1
- Watch event: 3
- Push event: 11
- Create event: 5
Dependencies
requirements.txt
pypi
- altair *
- groq *
- langchain *
- langchain-community *
- matplotlib *
- networkx *
- pandas *
- pinecone-client *
- pypdf *
- python-dotenv *
- sentence-transformers *
- streamlit *