25 lines
661 B
Python
25 lines
661 B
Python
from urllib.parse import urljoin

import chromadb
import httpx
from bs4 import BeautifulSoup
|
|
|
|
# Persistent Chroma client backed by an on-disk directory.
# NOTE(review): the path is machine-specific -- confirm before running elsewhere.
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")

# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"

QUERY_URL = BASE_URL + LIST_URL

# httpx does not follow redirects unless asked to; without this a 3xx
# response would be parsed as if it were the topic index.
r = httpx.get(QUERY_URL, follow_redirects=True)
# Fail loudly rather than scraping links out of an error page.
r.raise_for_status()
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(r.text, "html.parser")

container = soup.find("div", class_="field-body")
if container is None:
    # find() returns None when nothing matches; surface that clearly instead
    # of an AttributeError on the next line.
    raise RuntimeError(f'no <div class="field-body"> found at {QUERY_URL}')
a_s = container.find_all("a", href=True)

# NOTE(review): nothing visible in this script appends to new_texts yet.
new_texts = []

for link in a_s:
    endpoint = link["href"]
    # urljoin resolves relative hrefs against BASE_URL and leaves absolute
    # ones intact; plain concatenation would mangle the latter.
    query_url = urljoin(BASE_URL, endpoint)
    try:
        r2 = httpx.get(query_url, follow_redirects=True)
        r2.raise_for_status()
    except httpx.HTTPError as exc:
        # One dead or non-HTTP link (e.g. mailto:) should not abort the crawl.
        print(f"skipping {query_url}: {exc}")
        continue
    # NOTE(review): article_soup is parsed but not consumed in the visible code.
    article_soup = BeautifulSoup(r2.text, "html.parser")