refactor
This commit is contained in:
24
services/raggr/petmd_scrape_index.py
Normal file
24
services/raggr/petmd_scrape_index.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import chromadb
|
||||
import httpx
|
||||
|
||||
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
|
||||
|
||||
# Scrape
|
||||
BASE_URL = "https://www.vet.cornell.edu"
|
||||
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
|
||||
|
||||
QUERY_URL = BASE_URL + LIST_URL
|
||||
r = httpx.get(QUERY_URL)
|
||||
soup = BeautifulSoup(r.text)
|
||||
|
||||
container = soup.find("div", class_="field-body")
|
||||
a_s = container.find_all("a", href=True)
|
||||
|
||||
new_texts = []
|
||||
|
||||
for link in a_s:
|
||||
endpoint = link["href"]
|
||||
query_url = BASE_URL + endpoint
|
||||
r2 = httpx.get(query_url)
|
||||
article_soup = BeautifulSoup(r2.text)
|
||||
Reference in New Issue
Block a user