In [1]:
!pip install datasets
!pip install --upgrade transformers
!pip install --upgrade torch safetensors
!pip install --upgrade transformers --pre
!pip install --upgrade accelerate

from datasets import load_dataset
ds = load_dataset("MuskumPillerum/General-Knowledge")
ds.save_to_disk("General-Knowledge")


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting aiohttp
  Downloading aiohttp-3.11.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 KB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec[http]<=2024.12.0,>=2023.1.0
  Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 KB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x8

README.md:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

output.json:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37635 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/37635 [00:00<?, ? examples/s]

In [2]:
# Step 0: Install necessary dependencies (if not already installed)
!pip install datasets transformers

# -----------------------------------------------------
# Step 1: Import required libraries
# -----------------------------------------------------
import torch
from datasets import load_from_disk, DatasetDict
from transformers import (
    T5ForConditionalGeneration,    # Pretrained T5 model for conditional generation
    T5TokenizerFast,               # Pretrained T5 tokenizer (fast version)
    Trainer, TrainingArguments     # Hugging Face Trainer API and training configuration
)

# -----------------------------------------------------
# Step 2: Load your dataset from disk
# -----------------------------------------------------
# Your dataset is assumed to be stored in "/content/General-Knowledge"
dataset = load_from_disk("General-Knowledge")
print("Dataset loaded:")
print(dataset)

# -----------------------------------------------------
# Step 3: Load a pretrained T5 tokenizer
# -----------------------------------------------------
# We load the T5 tokenizer from the Hugging Face hub (e.g., "t5-base")
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
print("Pretrained tokenizer loaded with vocab size:", tokenizer.vocab_size)

# -----------------------------------------------------
# Step 4: Split the dataset into training and validation sets
# -----------------------------------------------------
# Here we perform a 90-10 split on the original dataset.
train_valid = dataset["train"].train_test_split(test_size=0.1)
dataset = DatasetDict({
    "train": train_valid["train"],
    "validation": train_valid["test"]
})

# -----------------------------------------------------
# Step 5: Define a tokenization function that explicitly casts inputs to strings
# -----------------------------------------------------
def tokenize_function(examples):
    # Convert every entry in 'Question' and 'Answer' to a string.
    questions = [str(q) for q in examples["Question"]]
    answers = [str(a) for a in examples["Answer"]]
    
    # Tokenize the questions (input texts)
    inputs = tokenizer(
        questions,
        max_length=64,          # Maximum input length (adjust as needed)
        truncation=True,
        padding="max_length"
    )
    
    # Tokenize the answers (target texts)
    targets = tokenizer(
        answers,
        max_length=64,          # Maximum target length (adjust as needed)
        truncation=True,
        padding="max_length"
    )
    
    # Set the tokenized answer as labels for the model's loss computation.
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply the tokenization function in batched mode to the entire dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print("Tokenization complete.")

# -----------------------------------------------------
# Step 6: Load a pretrained T5 model for conditional generation
# -----------------------------------------------------
# We load the "t5-base" model from Hugging Face.
# model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", low_cpu_mem_usage=True)


# T5 requires a decoder start token. We set it to the tokenizer's pad token.
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = tokenizer.pad_token_id
print("Model loaded.")

# -----------------------------------------------------
# Step 7: Define training arguments for fine-tuning
# -----------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",             # Directory to save checkpoints and logs
    num_train_epochs=3,                 # Number of training epochs
    per_device_train_batch_size=16,     # Batch size per device during training
    per_device_eval_batch_size=16,      # Batch size during evaluation
    learning_rate=5e-4,                 # Initial learning rate
    weight_decay=0.01,                  # Weight decay for regularization
    # evaluation_strategy="steps",        # Evaluate every few steps
    logging_steps=50,                   # Log training metrics every 50 steps
    save_steps=500,                     # Save checkpoint every 500 steps
    lr_scheduler_type="linear",         # Use a linear learning rate scheduler
    warmup_steps=100                    # Warmup steps for the scheduler
)

# -----------------------------------------------------
# Step 8: Initialize the Trainer and fine-tune the model
# -----------------------------------------------------
trainer = Trainer(
    model=model,                         # The T5 model to fine-tune
    args=training_args,                  # Training configuration defined above
    train_dataset=tokenized_dataset["train"],    # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset
    tokenizer=tokenizer                  # Pretrained tokenizer for consistent tokenization
)

print("Starting training...")
trainer.train()
print("Training complete.")

# -----------------------------------------------------
# Step 9: Save the fine-tuned model and tokenizer
# -----------------------------------------------------
model.save_pretrained("my_saved_model")
tokenizer.save_pretrained("my_saved_model")
print("Model and tokenizer saved to 'my_saved_model'.")

# -----------------------------------------------------
# Step 10: Inference - Load the saved model and tokenizer, then generate text
# -----------------------------------------------------
# Load the model and tokenizer for inference
model_infer = T5ForConditionalGeneration.from_pretrained("my_saved_model")
tokenizer_infer = T5TokenizerFast.from_pretrained("my_saved_model")

# Set the model to evaluation mode
model_infer.eval()

# Define a sample input text
input_text = "What is Artificial Intelligence?"

# Tokenize the input text
input_ids = tokenizer_infer(input_text, return_tensors="pt").input_ids

# Generate output using beam search for improved quality
outputs = model_infer.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    early_stopping=True
)

# Decode the generated tokens into a human-readable string
generated_text = tokenizer_infer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:", generated_text)


Dataset loaded:
DatasetDict({
    train: Dataset({
        features: ['Answer', 'Question'],
        num_rows: 37635
    })
})


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Pretrained tokenizer loaded with vocab size: 32100


Map:   0%|          | 0/33871 [00:00<?, ? examples/s]

Map:   0%|          | 0/3764 [00:00<?, ? examples/s]

Tokenization complete.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded.


  trainer = Trainer(


Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,4.0054
100,1.8178
150,1.7246
200,1.7022
250,1.705
300,1.6632
350,1.6323
400,1.668
450,1.6335
500,1.5508


Training complete.
Model and tokenizer saved to 'my_saved_model'.
Generated text: Artificial Intelligence (AI) is a branch of computer science that focuses on the development and use of computer systems that are capable of performing tasks that would normally require human intelligence, such as problem-solving, decision-making,


In [3]:
# -----------------------------------------------------
# Step 10: Inference - Load the saved model and tokenizer, then generate text
# -----------------------------------------------------
# Load the model and tokenizer for inference
model_infer = T5ForConditionalGeneration.from_pretrained("my_saved_model")
tokenizer_infer = T5TokenizerFast.from_pretrained("my_saved_model")

# Set the model to evaluation mode
model_infer.eval()

# Define a sample input text
input_text = "Tell me about RAG implementation method"

# Tokenize the input text
input_ids = tokenizer_infer(input_text, return_tensors="pt").input_ids

# Generate output using beam search for improved quality
outputs = model_infer.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    early_stopping=True
)

# Decode the generated tokens into a human-readable string
generated_text = tokenizer_infer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:", generated_text)

Generated text: RAG implementation method is a type of artificial intelligence (AI) implementation method that is used to generate natural language output. It is a type of artificial intelligence (AI) implementation method that is used to generate natural language output. It is


In [4]:
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
# -----------------------------------------------------
# Step 0: Install required packages (if not already installed)
# -----------------------------------------------------
# Uncomment and run these commands in your environment if needed:
# !pip install transformers faiss-cpu datasets

import json
import torch
import numpy as np
import faiss

# -----------------------------------------------------
# Step 1: Load the Salman Khan passages from the JSON file
# -----------------------------------------------------
# Assuming the JSON file is in the current directory as "salman_khan_passages.json"
with open("combined_celebrities_passages.json", "r") as f:
    passages_data = json.load(f)

# Extract the passage texts from the JSON data
passage_texts = [entry["text"] for entry in passages_data]
print(f"Loaded {len(passage_texts)} passages.")

# -----------------------------------------------------
# Step 2: Build an index using DPR Context Encoder
# -----------------------------------------------------
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load the DPR context encoder and its tokenizer for passage embeddings
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Compute embeddings for each passage and store them in a list
passage_embeddings = []
for text in passage_texts:
    # Tokenize the passage text
    inputs = ctx_tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    # Compute the embedding (pooler_output gives a fixed-size vector)
    with torch.no_grad():
        embedding = ctx_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    passage_embeddings.append(embedding.squeeze(0).numpy())

# Convert list of embeddings to a NumPy array
passage_embeddings = np.vstack(passage_embeddings)  # shape: (num_passages, hidden_dim)
embedding_dim = passage_embeddings.shape[1]
print(f"Computed embeddings with dimension: {embedding_dim}")

# Create a FAISS index for efficient similarity search (L2 distance)
index = faiss.IndexFlatL2(embedding_dim)
index.add(passage_embeddings)
print(f"FAISS index built with {index.ntotal} passages.")

# -----------------------------------------------------
# Step 3: Define a retrieval function using a DPR Question Encoder
# -----------------------------------------------------
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load the DPR question encoder and its tokenizer to encode queries
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

def retrieve_passages(query, top_k=3):
    """
    Given a query string, encode it using the DPR question encoder,
    search the FAISS index, and return the top_k passages as raw text.
    """
    # Tokenize and encode the query
    inputs = question_tokenizer(query, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        query_embedding = question_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    
    # Convert the query embedding to NumPy and reshape for FAISS
    query_embedding_np = query_embedding.squeeze(0).numpy().reshape(1, -1)
    distances, indices = index.search(query_embedding_np, top_k)
    
    # Retrieve the corresponding raw texts for the top_k passages
    retrieved_texts = [passage_texts[idx] for idx in indices[0]]
    return retrieved_texts

# -----------------------------------------------------
# Step 4: Load your fine-tuned T5 model and tokenizer for generation
# -----------------------------------------------------
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load the fine-tuned T5 model saved in "my_saved_model"
t5_model = T5ForConditionalGeneration.from_pretrained("my_saved_model")
t5_tokenizer = T5TokenizerFast.from_pretrained("my_saved_model")
t5_model.eval()
print("Fine-tuned T5 model loaded for generation.")

# -----------------------------------------------------
# Step 5: Define a function to generate an answer using retrieved passages
# -----------------------------------------------------
def generate_answer(query, top_k=3, max_length=100):
    """
    Given a query, retrieve top_k relevant passages,
    concatenate them with the query, and generate an answer using T5.
    """
    # Retrieve passages using the DPR-based retriever
    retrieved_passages = retrieve_passages(query, top_k=top_k)
    # Concatenate retrieved passages to form the context
    context = " ".join(retrieved_passages)
    # Construct an input prompt that combines the question and the context
    input_prompt = f"question: {query} context: {context}"
    
    # Tokenize the combined prompt with the T5 tokenizer
    input_ids = t5_tokenizer(input_prompt, return_tensors="pt").input_ids
    # Generate answer using beam search for improved quality
    outputs = t5_model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    # Decode the generated token IDs back to a string
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# -----------------------------------------------------
# Step 6: Example usage - Query the pipeline
# -----------------------------------------------------
if __name__ == "__main__":
    # Example query related to Salman Khan (e.g., his early life)
    query = "who is nikki ?"
    answer = generate_answer(query, top_k=3, max_length=100)
    print("Query:", query)
    print("Answer:", answer)


Loaded 31 passages.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Computed embeddings with dimension: 768
FAISS index built with 31 passages.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fine-tuned T5 model loaded for generation.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Query: who is nikki benz?
Answer: Ariella Ferrera was born on December 11, 1981, in Mariupol, Ukraine, and raised in Toronto, Canada. She became a contract star for major studios like Digital Playground and was inducted into the AVN Hall of Fame.


In [None]:
implementation with normal modle and with rag

In [None]:
# -----------------------------------------------------
# Step 0: Install required packages (if not already installed)
# -----------------------------------------------------
# Uncomment and run these commands in your environment if needed:
# !pip install transformers faiss-cpu datasets

import json
import torch
import numpy as np
import faiss

# -----------------------------------------------------
# Step 1: Load the celebrity passages from the JSON file
# -----------------------------------------------------
with open("combined_celebrities_passages.json", "r") as f:
    passages_data = json.load(f)

# Extract the passage texts from the JSON data
passage_texts = [entry["text"] for entry in passages_data]
print(f"Loaded {len(passage_texts)} passages.")

# -----------------------------------------------------
# Step 2: Build an index using DPR Context Encoder with normalized embeddings
# -----------------------------------------------------
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load the DPR context encoder and its tokenizer for passage embeddings
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Compute and normalize embeddings for each passage
normalized_embeddings = []
for text in passage_texts:
    inputs = ctx_tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        embedding = ctx_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    embedding = embedding.squeeze(0).numpy()
    norm = np.linalg.norm(embedding)
    if norm != 0:
        embedding = embedding / norm
    normalized_embeddings.append(embedding)

# Convert list of normalized embeddings to a NumPy array
normalized_embeddings = np.vstack(normalized_embeddings)  # shape: (num_passages, hidden_dim)
embedding_dim = normalized_embeddings.shape[1]
print(f"Computed normalized embeddings with dimension: {embedding_dim}")

# Create a FAISS index using inner product (IP)
index = faiss.IndexFlatIP(embedding_dim)
index.add(normalized_embeddings)
print(f"FAISS index built with {index.ntotal} passages.")

# -----------------------------------------------------
# Step 3: Define a retrieval function using a DPR Question Encoder with normalized query embedding
# -----------------------------------------------------
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load the DPR question encoder and its tokenizer to encode queries
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

def retrieve_passages(query, top_k=3):
    """
    Given a query string, encode it using the DPR question encoder,
    normalize the query embedding, search the FAISS index, and return
    the top_k passages as raw text along with their similarity scores.
    """
    inputs = question_tokenizer(query, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        query_embedding = question_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    
    query_embedding = query_embedding.squeeze(0).numpy()
    norm = np.linalg.norm(query_embedding)
    if norm != 0:
        query_embedding = query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1)
    
    # Search using inner product similarity (cosine similarity with normalized vectors)
    similarities, indices = index.search(query_embedding, top_k)
    
    # Retrieve the corresponding raw texts for the top_k passages
    retrieved_texts = [passage_texts[idx] for idx in indices[0]]
    return retrieved_texts, similarities

# -----------------------------------------------------
# Step 4: Load your fine-tuned T5 model and tokenizer for generation
# -----------------------------------------------------
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load the fine-tuned T5 model saved in "my_saved_model"
t5_model = T5ForConditionalGeneration.from_pretrained("my_saved_model")
t5_tokenizer = T5TokenizerFast.from_pretrained("my_saved_model")
t5_model.eval()
print("Fine-tuned T5 model loaded for generation.")

# -----------------------------------------------------
# Step 5: Define a function to generate an answer using retrieved passages (or bypass retrieval if out-of-domain)
# -----------------------------------------------------
def generate_answer(query, top_k=3, max_length=100, similarity_threshold=0.5):
    """
    Given a query, determine if it is in-domain using keyword filtering.
    If it is in-domain (celebrity-related), attempt to retrieve top_k relevant passages.
    Otherwise, bypass retrieval and let T5 answer using its internal knowledge.
    """
    # Define a set of keywords indicating celebrity domain
    celebrity_keywords = {"salman", "celebrity", "actor", "film", "movie", "khan", "bollywood"}
    
    # Check if query contains any celebrity keyword (case-insensitive)
    in_domain = any(keyword in query.lower() for keyword in celebrity_keywords)
    
    if in_domain:
        # Attempt retrieval
        retrieved_passages, similarities = retrieve_passages(query, top_k=top_k)
        best_similarity = similarities[0][0]
        # Optionally, you can also use the similarity threshold here if needed.
        if best_similarity < similarity_threshold:
            print(f"Retrieval similarity ({best_similarity:.2f}) below threshold; bypassing retrieval.")
            input_prompt = f"question: {query}"
        else:
            context = " ".join(retrieved_passages)
            input_prompt = f"question: {query} context: {context}"
    else:
        # Out-of-domain: bypass retrieval entirely
        print("Query deemed out-of-domain; bypassing retrieval.")
        input_prompt = f"question: {query}"
    
    input_ids = t5_tokenizer(input_prompt, return_tensors="pt").input_ids
    outputs = t5_model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# -----------------------------------------------------
# Step 6: Example usage - Query the pipeline
# -----------------------------------------------------
if __name__ == "__main__":
    # Test with an out-of-domain question (should bypass retrieval)
    query = "what is artificial intelligence?"
    answer = generate_answer(query, top_k=3, max_length=100, similarity_threshold=0.5)
    print("Query:", query)
    print("Answer:", answer)
    
    # Test with an in-domain question related to a celebrity (e.g., Salman Khan)
    query2 = "tell me about salman khan's early life"
    answer2 = generate_answer(query2, top_k=3, max_length=100, similarity_threshold=0.5)
    print("Query:", query2)
    print("Answer:", answer2)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Loaded 31 passages.


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


completed our implementation

In [None]:
website with url

In [None]:
!pip install transformers faiss-cpu datasets requests beautifulsoup4 torch numpy


In [6]:
# -----------------------------------------------------
# Step 0: Install required packages (if not already installed)
# -----------------------------------------------------
# Uncomment and run these commands in your environment if needed:
# !pip install transformers faiss-cpu datasets requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import json
import torch
import numpy as np
import faiss

# -----------------------------------------------------
# Step 1: Scrape a website to extract its content as passages
# -----------------------------------------------------
def scrape_website(url, chunk_size=200):
    """
    Given a URL, scrape the webpage content, extract paragraph text,
    and split it into chunks (passages) of approximately chunk_size words.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    full_text = " ".join([para.get_text(separator=" ", strip=True) for para in paragraphs])
    
    # Split the full text into words and then into chunks
    words = full_text.split()
    passages = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        passages.append(chunk)
    return passages

# Example website link (replace with your desired URL)
website_url = "https://www.mujjumujahid.com/"
passage_texts = scrape_website(website_url)

print(f"Scraped and split website content into {len(passage_texts)} passages.")


# Print out the scraped passages to confirm the content
print(f"Scraped and split website content into {len(passage_texts)} passages.\n")
for i, passage in enumerate(passage_texts):
    print(f"Passage {i+1}:")
    print(passage)
    print("-" * 80)

# -----------------------------------------------------
# Step 2: Build an index using DPR Context Encoder with normalized embeddings
# -----------------------------------------------------
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load the DPR context encoder and its tokenizer for passage embeddings
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Compute and normalize embeddings for each passage
passage_embeddings = []
for text in passage_texts:
    inputs = ctx_tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        embedding = ctx_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    vec = embedding.squeeze(0).numpy()
    # Normalize the embedding
    norm = np.linalg.norm(vec)
    if norm > 0:
        vec = vec / norm
    passage_embeddings.append(vec)

# Convert list of embeddings to a NumPy array
passage_embeddings = np.vstack(passage_embeddings)  # shape: (num_passages, hidden_dim)
embedding_dim = passage_embeddings.shape[1]
print(f"Computed normalized embeddings with dimension: {embedding_dim}")

# Create a FAISS index using inner product (for cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(passage_embeddings)
print(f"FAISS index built with {index.ntotal} passages.")

# -----------------------------------------------------
# Step 3: Define a retrieval function using a DPR Question Encoder with normalized query embedding
# -----------------------------------------------------
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load the DPR question encoder and its tokenizer
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

def retrieve_passages(query, top_k=3):
    """
    Encode the query with the DPR question encoder (normalized) and search the FAISS index.
    Returns the top_k passages as raw text.
    """
    inputs = question_tokenizer(query, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        query_embedding = question_encoder(**inputs).pooler_output  # shape: (1, hidden_dim)
    
    query_vec = query_embedding.squeeze(0).numpy()
    norm = np.linalg.norm(query_vec)
    if norm > 0:
        query_vec = query_vec / norm
    query_vec = query_vec.reshape(1, -1)
    
    # Search using inner product similarity (which is cosine similarity for normalized vectors)
    similarities, indices = index.search(query_vec, top_k)
    retrieved_texts = [passage_texts[idx] for idx in indices[0]]
    return retrieved_texts

# -----------------------------------------------------
# Step 4: Load your fine-tuned T5 model and tokenizer for generation
# -----------------------------------------------------
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load your fine-tuned T5 model (ensure "my_saved_model" contains your model files)
t5_model = T5ForConditionalGeneration.from_pretrained("my_saved_model")
t5_tokenizer = T5TokenizerFast.from_pretrained("my_saved_model")
t5_model.eval()
print("Fine-tuned T5 model loaded for generation.")

# -----------------------------------------------------
# Step 5: Define a function to generate an answer using retrieved passages
# -----------------------------------------------------
def generate_answer(query, top_k=3, max_length=150):
    """
    Retrieve top_k passages from the website content, format the prompt clearly,
    and generate an answer using the fine-tuned T5 model.
    """
    retrieved_passages = retrieve_passages(query, top_k=top_k)
    context = " ".join(retrieved_passages)
    # Improved prompt formatting with clear sections
    input_prompt = f"Question: {query}\n\nContext: {context}\n\nAnswer:"
    
    input_ids = t5_tokenizer(input_prompt, return_tensors="pt").input_ids
    outputs = t5_model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# -----------------------------------------------------
# Step 6: Example usage - Query the pipeline
# -----------------------------------------------------
if __name__ == "__main__":
    # Replace the query with one that is relevant to the website's content.
    query = "who is mohammad mujahid what does he do?"
    answer = generate_answer(query, top_k=3, max_length=150)
    print("Query:", query)
    print("Answer:", answer)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Scraped and split website content into 10 passages.
Scraped and split website content into 10 passages.

Passage 1:
Full Stack Expert As a full stack expert, cloud specialist, and Artificial Intelligence innovator, I integrate advanced front-end design with robust back-end systems deployed on scalable cloud architectures. I harness artificial intelligence to optimize processes and create dynamic applications. all while driving entrepreneurial ventures focused on solving mental health challenges using Artificial Intelligence and cognitive processes. My Company: Visit our site My Resume React Redux TypeScript AJAX Next.js HTML5 CSS3 JavaScript Bootstrap Figma Vue.js Angular Sass WebPack Node.js ASP.NET Python NPM PHP Java Go Ruby on Rails PostgreSQL Sequelize SQL WCF Mongodb AWS Azure Google-Cloud Alibaba-Cloud Git GitHub GitLab Bitbucket Docker Jenkins Kubernetes Bash-Script Ansible Terraform Grafana Python TensorFlow PyTorch Keras Scikit-Learn Pandas NumPy Jupyter OpenCV NLP Below is a

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Computed normalized embeddings with dimension: 768
FAISS index built with 10 passages.


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Fine-tuned T5 model loaded for generation.
Query: who is mohammad mujahid what does he do?
Answer: "Mujjus AI Image Downloader is a powerful and intuitive web application designed to provide users with a seamless and intuitive user experience. It features a user-friendly interface, allowing users to quickly and easily download and view a wide range of images and videos. The app also provides
