Fine-tuning Llama 2 on specific website content
The .ipynb file will open in Colab if you would like to try it out. I have pasted the Colab source code below for readability (i.e. the following runs in my Colab notebook):
!pip install kaleido
!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers
!pip install faiss-cpu
!pip install unstructured
!pip install chromadb
!pip install Cython
!pip install tiktoken
!pip install unstructured[local-inference]
!pip install -qU transformers
!pip install -qU accelerate
!pip install -qU einops
!pip install -qU xformers
!pip install -qU bitsandbytes
!pip install -qU faiss-gpu
Restart the Colab runtime, then run the following code:
# Generate a Hugging Face API token and paste it in here
YourHuggingFaceAPIToken = 'abcd'
# Imports
import os
import torch
import requests
import transformers
from torch import cuda, bfloat16
import xml.dom.minidom as minidom
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import WebBaseLoader
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Fetch the sitemap text of each site that is to be indexed
fermyon_website_sitemap = requests.get('https://www.fermyon.com/sitemap.xml', allow_redirects=True).text
fermyon_documentation_sitemap = requests.get('https://developer.fermyon.com/sitemap.xml', allow_redirects=True).text
component_model_documentation_sitemap = requests.get('https://component-model.bytecodealliance.org/sitemap.xml', allow_redirects=True).text
# Parse each sitemap's text to obtain list of pages
parsed_fermyon_website_sitemap_document = minidom.parseString(fermyon_website_sitemap)
parsed_fermyon_documentation_sitemap_document = minidom.parseString(fermyon_documentation_sitemap)
parsed_component_model_documentation_sitemap_document = minidom.parseString(component_model_documentation_sitemap)
# Cherry pick just the loc elements from the XML
fermyon_website_sitemap_loc_elements = parsed_fermyon_website_sitemap_document.getElementsByTagName('loc')
fermyon_documentation_sitemap_loc_elements = parsed_fermyon_documentation_sitemap_document.getElementsByTagName('loc')
component_model_documentation_sitemap_loc_elements = parsed_component_model_documentation_sitemap_document.getElementsByTagName('loc')
# Declare blank lists of pages for each site
fermyon_website_page_urls = []
fermyon_documentation_page_urls = []
component_model_documentation_page_urls = []
# Iterate over loc elements (of each sitemap) and add to that site's list of pages
for fermyon_website_sitemap_loc_element in fermyon_website_sitemap_loc_elements:
    fermyon_website_page_urls.append(fermyon_website_sitemap_loc_element.toxml().removesuffix("</loc>").removeprefix("<loc>"))
for fermyon_documentation_sitemap_loc_element in fermyon_documentation_sitemap_loc_elements:
    fermyon_documentation_page_urls.append(fermyon_documentation_sitemap_loc_element.toxml().removesuffix("</loc>").removeprefix("<loc>"))
for component_model_documentation_sitemap_loc_element in component_model_documentation_sitemap_loc_elements:
    component_model_documentation_page_urls.append(component_model_documentation_sitemap_loc_element.toxml().removesuffix("</loc>").removeprefix("<loc>"))
URLs = fermyon_website_page_urls + fermyon_documentation_page_urls + component_model_documentation_page_urls
text_to_remove = "rss"
filtered_list = [item for item in URLs if text_to_remove not in item]
print("Number of page to process is {}\n First page to process is {} and the last page to process is {}".format(len(filtered_list), filtered_list[0], filtered_list[len(filtered_list) - 1]))
os.environ["HUGGINGFACEHUB_API_TOKEN"] = YourHuggingFaceAPIToken
hf_auth = YourHuggingFaceAPIToken
loader2 = [UnstructuredURLLoader(urls=filtered_list)]  # not used below; WebBaseLoader fetches the pages instead
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
# begin initializing HF items, you need an access token
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
# enable evaluation mode to allow model inference
model.eval()
print(f"Model loaded on {device}")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
# move each stop sequence's token ids onto the same device as the model
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
print(stop_token_ids)  # inspect the tokenized stop sequences
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this the model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this the output begins repeating
)
llm = HuggingFacePipeline(pipeline=generate_text)
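Before wiring the model into retrieval, a quick smoke test of the wrapped pipeline is worthwhile; the prompt here is just an illustration:
# Sanity check: the wrapped pipeline should produce a coherent completion
print(llm("Explain in one sentence what WebAssembly is."))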
web_links = filtered_list
loader = WebBaseLoader(web_links)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)
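A quick check that the loader and splitter behaved as expected:
# Each ~1000-character chunk (with 20 characters of overlap) becomes one embedding below
print("Loaded {} pages and produced {} chunks".format(len(documents), len(all_splits)))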
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
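At this point you can sanity-check the index with a raw similarity search, and optionally persist it so the pages don't have to be re-embedded on every run (the query and folder name here are just examples):
# Retrieve the two chunks most similar to a test query
for doc in vectorstore.similarity_search("What is Fermyon Spin?", k=2):
    print(doc.metadata.get('source'))

# Optionally save the FAISS index to the Colab filesystem for reuse
vectorstore.save_local("faiss_index")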
from langchain.chains import ConversationalRetrievalChain
chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)
After the above has run in the Colab, you can test by running the following code (insert your own questions that relate to the websites that were indexed above):
chat_history = []
query1 = "Can you please write me a few sentences about what Fermyon does and why a developer would want to use Fermyon Spin and Fermyon Cloud?"
result1 = chain({"question": query1, "chat_history": chat_history})
print("Tell me about Fermyon?\n")
print(result1['answer'])
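Because the chain is conversational, you can carry the first exchange forward in chat_history and ask a follow-up; the second question below is just an example:
# Follow-up turn: pass the previous (question, answer) pair as chat history
chat_history = [(query1, result1['answer'])]
query2 = "Which programming languages can I use to build Spin applications?"
result2 = chain({"question": query2, "chat_history": chat_history})
print(result2['answer'])
# return_source_documents=True also lets you see which pages informed the answer
print([doc.metadata.get('source') for doc in result2['source_documents']])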