1 ChromaDB服务器模式 #
!pip install chromadb==0.4.20
import numpy as np
import pandas as pd
# Data source: the Kaggle "topic-labeled-news-dataset". Download and unzip it first:
# https://www.kaggle.com/datasets/kotartemiy/topic-labeled-news-dataset
# Then point read_csv at the directory where the extracted .csv file is stored.
MAX_NEWS = 1000     # upper bound on the number of rows kept for this demo
DOCUMENT = "title"  # column used as the document text
TOPIC = "topic"     # column used as the per-document metadata value

# The file is semicolon-separated.
news = pd.read_csv(
    "./kaggle/labelled_newscatcher_dataset.csv",
    sep=";",
)

# This is only an example, so keep just the first MAX_NEWS rows.
subset_news = news.iloc[:MAX_NEWS]
import chromadb

# Persistent client: the collection is stored on disk under ./chromadb.
chroma_client = chromadb.PersistentClient(path="./chromadb")

collection_name = "local_news_collection"

# Start from a clean collection on every run. Check the names of ALL existing
# collections: looking only at the first entry would miss the collection when
# it is not listed first, and create_collection would then fail on the
# duplicate name.
existing_names = [c.name for c in chroma_client.list_collections()]
if collection_name in existing_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.create_collection(name=collection_name)

# Generate one id per row actually present in subset_news. The dataset may
# contain fewer than MAX_NEWS rows, and the number of ids has to match the
# number of documents and metadatas passed to add().
collection.add(
    documents=subset_news[DOCUMENT].tolist(),
    metadatas=[{TOPIC: topic} for topic in subset_news[TOPIC].tolist()],
    ids=[f"id{x}" for x in range(len(subset_news))],
)
# Query the local collection with the text "laptop" and request the
# 10 closest entries, then show the raw result dictionary.
results = collection.query(
    query_texts=["laptop"],
    n_results=10,
)
print(results)
{'ids': [['id173', 'id829', 'id117', 'id535', 'id141', 'id218', 'id390', 'id273', 'id56', 'id900']], 'distances': [[0.8593592047691345, 1.0294400453567505, 1.0793328285217285, 1.093001365661621, 1.1329681873321533, 1.2130439281463623, 1.2143322229385376, 1.2164145708084106, 1.222063660621643, 1.275417447090149]], 'metadatas': [[{'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}]], 'embeddings': None, 'documents': [['The Legendary Toshiba is Officially Done With Making Laptops', '3 gaming laptop deals you can’t afford to miss today', 'Lenovo and HP control half of the global laptop market', 'Asus ROG Zephyrus G14 gaming laptop announced in India', 'Acer Swift 3 featuring a 10th-generation Intel Ice Lake CPU, 2K screen, and more launched in India for INR 64999 (US$865)', "Apple's Next MacBook Could Be the Cheapest in Company's History", "Features of Huawei's Desktop Computer Revealed", 'Redmi to launch its first gaming laptop on August 14: Here are all the details', 'Toshiba shuts the lid on laptops after 35 years', 'This is the cheapest Windows PC by a mile and it even has a spare SSD slot']], 'uris': None, 'data': None}
#Running Chroma in Server Mode
!chroma run --path ./chromadb
((((((((( (((((#### ((((((((((((((((((((((######### ((((((((((((((((((((((((########### ((((((((((((((((((((((((((############ (((((((((((((((((((((((((((############# (((((((((((((((((((((((((((############# (((((((((((((((((((((((((############## ((((((((((((((((((((((((############## (((((((((((((((((((((############# ((((((((((((((((############## ((((((((( ######### Running Chroma Saving data to: ./chromadb Connect to chroma at: http://localhost:8000 Getting started guide: https://docs.trychroma.com/getting-started INFO: [24-12-2023 16:53:25] Set chroma_server_nofile to 65535 INFO: [24-12-2023 16:53:25] Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information. DEBUG: [24-12-2023 16:53:25] Starting component System DEBUG: [24-12-2023 16:53:25] Starting component OpenTelemetryClient DEBUG: [24-12-2023 16:53:25] Starting component SimpleAssignmentPolicy DEBUG: [24-12-2023 16:53:25] Starting component SqliteDB DEBUG: [24-12-2023 16:53:25] Starting component Posthog DEBUG: [24-12-2023 16:53:25] Starting component LocalSegmentManager DEBUG: [24-12-2023 16:53:25] Starting component SegmentAPI INFO: [24-12-2023 16:53:25] Started server process [7527] INFO: [24-12-2023 16:53:25] Waiting for application startup. INFO: [24-12-2023 16:53:25] Application startup complete. 
INFO: [24-12-2023 16:53:25] Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) INFO: [24-12-2023 16:57:57] ::1:52448 - "GET /api/v1/tenants/default_tenant HTTP/1.1" 200 INFO: [24-12-2023 16:57:57] ::1:52448 - "GET /api/v1/databases/default_database?tenant=default_tenant HTTP/1.1" 200 INFO: [24-12-2023 17:06:21] ::1:52463 - "GET /api/v1/collections/local_news_collection?tenant=default_tenant&database=default_database HTTP/1.1" 200 DEBUG: [24-12-2023 17:06:22] Starting component PersistentLocalHnswSegment INFO: [24-12-2023 17:06:22] ::1:52463 - "POST /api/v1/collections/a37539aa-239b-44f5-8079-c36926d21419/query HTTP/1.1" 200 ^C INFO: [24-12-2023 17:56:12] Shutting down INFO: [24-12-2023 17:56:12] Waiting for application shutdown. INFO: [24-12-2023 17:56:12] Application shutdown complete. INFO: [24-12-2023 17:56:12] Finished server process [7527]
2 ChromaDB Client #
import chromadb

# Connect over HTTP to a Chroma server listening on localhost:8000
# (the address reported by `chroma run`).
client = chromadb.HttpClient(host="localhost", port=8000)

# Look up the collection by name; this call does not create it.
collection_local = client.get_collection(name="local_news_collection")

# Same "laptop" query as in the local section, this time sent through the server.
results = collection_local.query(
    query_texts=["laptop"],
    n_results=10,
)
print(results)
{'ids': [['id173', 'id829', 'id117', 'id535', 'id141', 'id218', 'id390', 'id273', 'id56', 'id900']], 'distances': [[0.8593592047691345, 1.0294400453567505, 1.0793328285217285, 1.093001365661621, 1.1329681873321533, 1.2130439281463623, 1.2143322229385376, 1.2164145708084106, 1.222063660621643, 1.275417447090149]], 'embeddings': None, 'metadatas': [[{'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}, {'topic': 'TECHNOLOGY'}]], 'documents': [['The Legendary Toshiba is Officially Done With Making Laptops', '3 gaming laptop deals you can’t afford to miss today', 'Lenovo and HP control half of the global laptop market', 'Asus ROG Zephyrus G14 gaming laptop announced in India', 'Acer Swift 3 featuring a 10th-generation Intel Ice Lake CPU, 2K screen, and more launched in India for INR 64999 (US$865)', "Apple's Next MacBook Could Be the Cheapest in Company's History", "Features of Huawei's Desktop Computer Revealed", 'Redmi to launch its first gaming laptop on August 14: Here are all the details', 'Toshiba shuts the lid on laptops after 35 years', 'This is the cheapest Windows PC by a mile and it even has a spare SSD slot']], 'uris': None, 'data': None}