Implement faster recipe lookup without browser
Do resource lookup and run regular refetch-code after wiki search.
This commit is contained in:
parent
82e7f26c89
commit
ba4aa26be3
|
@ -5,15 +5,34 @@ from typing import Optional
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import sqlalchemy
|
|
||||||
from selenium.webdriver import Firefox
|
from selenium.webdriver import Firefox
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.firefox.options import Options
|
from selenium.webdriver.firefox.options import Options
|
||||||
from selenium.webdriver.remote.webdriver import WebDriver
|
from sqlalchemy import String, create_engine, select, ForeignKey, Select, Table, Column
|
||||||
from sqlalchemy import String, create_engine, select, ForeignKey, Select, Table, Column, delete
|
|
||||||
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, Session
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, Session
|
||||||
|
|
||||||
recipe_info_timeout = datetime.timedelta(days=30)
|
__debug = False
|
||||||
|
__recipe_info_timeout = datetime.timedelta(days=30)
|
||||||
|
__browser: Optional[Firefox] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_browser() -> Firefox:
    """Return the shared Firefox driver, starting it on first use.

    The driver is kept in the module-level ``__browser`` variable so that
    repeated calls hand back the same instance. A new driver is started
    with a 1600x1015 window and runs headless unless the module-level
    ``__debug`` flag is set.

    Returns:
        The module-wide ``Firefox`` driver instance.
    """
    global __browser

    # Fast path: a driver was already started by an earlier call.
    if __browser is not None:
        return __browser

    options = Options()
    for argument in ("--width=1600", "--height=1015"):
        options.add_argument(argument)
    if not __debug:
        # Only show a browser window when debugging.
        options.add_argument("--headless")

    __browser = Firefox(options=options)
    # Implicit wait applied to element lookups on this driver.
    __browser.implicitly_wait(5)
    return __browser
|
||||||
|
|
||||||
|
|
||||||
|
def browser_cleanup():
    """Quit the shared browser, if one was started, unless debugging.

    When the module-level ``__debug`` flag is set the browser is left
    running (presumably so the window can still be inspected). Otherwise
    the driver stored in ``__browser`` is quit and the reference is
    cleared, so a later ``get_browser()`` call starts a fresh driver
    instead of returning one whose session has already ended.
    """
    global __browser

    if __debug or __browser is None:
        return

    try:
        __browser.quit()
    finally:
        # Drop the reference even if quit() raised: the old driver must
        # not be handed out again by get_browser().
        __browser = None
|
||||||
|
|
||||||
|
|
||||||
class Base(DeclarativeBase):
|
class Base(DeclarativeBase):
|
||||||
|
@ -103,116 +122,74 @@ class Recipe(Base):
|
||||||
return f"Recipe(id={self.id}, factory={self.factory}, ingredients={self.ingredients}, results={self.results})"
|
return f"Recipe(id={self.id}, factory={self.factory}, ingredients={self.ingredients}, results={self.results})"
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(browser: WebDriver, href: str) -> str:
|
def normalize_url(href: str) -> str:
|
||||||
return urljoin(base=browser.current_url, url=href)
|
return urljoin(base=get_browser().current_url, url=href)
|
||||||
|
|
||||||
|
|
||||||
def populate_recipes(browser: WebDriver, engine: sqlalchemy.Engine, input_resource_label: str) -> int:
|
def populate_recipes(session: Session, input_resource_label: str) -> Resource:
|
||||||
|
browser = get_browser()
|
||||||
browser.find_element(By.CSS_SELECTOR, "button[id$='tab-0']").click()
|
browser.find_element(By.CSS_SELECTOR, "button[id$='tab-0']").click()
|
||||||
recipes_html_elems = browser.find_elements(By.CSS_SELECTOR, "div[id$='tabpanel-0'] > div > div")
|
recipes_html_elems = browser.find_elements(By.CSS_SELECTOR, "div[id$='tabpanel-0'] > div > div")
|
||||||
with Session(engine, autoflush=False) as session:
|
resources: dict[str, Resource] = {}
|
||||||
for recipe_idx in range(len(recipes_html_elems)):
|
new_resources: list[Resource] = []
|
||||||
recipe_html_elem = recipes_html_elems[recipe_idx]
|
|
||||||
factory_html_elem = recipe_html_elem.find_element(By.CSS_SELECTOR, ".flex-col > span > a")
|
|
||||||
factory_label = factory_html_elem.text.strip()
|
|
||||||
factory_url = urljoin(base=browser.current_url, url=factory_html_elem.get_attribute("href"))
|
|
||||||
print("recipe", recipe_idx, "produced in:", factory_label, factory_url)
|
|
||||||
|
|
||||||
def extract_resource_flow(html_elem):
|
for recipe_idx in range(len(recipes_html_elems)):
|
||||||
resource_img = html_elem.find_element(By.TAG_NAME, "img")
|
recipe_html_elem = recipes_html_elems[recipe_idx]
|
||||||
resource_label = resource_img.get_attribute("alt").strip()
|
factory_html_elem = recipe_html_elem.find_element(By.CSS_SELECTOR, ".flex-col > span > a")
|
||||||
wiki_url = normalize_url(
|
factory_label = factory_html_elem.text.strip()
|
||||||
browser=browser,
|
factory_url = urljoin(base=browser.current_url, url=factory_html_elem.get_attribute("href"))
|
||||||
href=html_elem.find_element(By.TAG_NAME, "a").get_attribute("href"),
|
|
||||||
)
|
|
||||||
resource = Resource(label=resource_label, wiki_url=wiki_url)
|
|
||||||
amount = html_elem.find_element(By.CSS_SELECTOR, ".text-xs:nth-child(2)").text.strip()
|
|
||||||
time = html_elem.find_element(By.CSS_SELECTOR, ".text-xs:nth-child(3)").text.strip()
|
|
||||||
return ResourceFlow(resource=resource, amount=amount, time=time)
|
|
||||||
|
|
||||||
ingredient_html_elems = recipe_html_elem.find_elements(
|
def extract_resource_flow(html_elem):
|
||||||
By.CSS_SELECTOR, f".flex-row > div:nth-child(1) > div:has(> a)"
|
resource_img = html_elem.find_element(By.TAG_NAME, "img")
|
||||||
)
|
resource_label = resource_img.get_attribute("alt").strip()
|
||||||
ingredients: list[ResourceFlow] = []
|
assert resource_label, "resource label is missing"
|
||||||
for ingredient_idx in range(len(ingredient_html_elems)):
|
if resource_label in resources:
|
||||||
resource_flow = extract_resource_flow(ingredient_html_elems[ingredient_idx])
|
resource = resources[resource_label]
|
||||||
ingredients.append(resource_flow)
|
else:
|
||||||
print(
|
resource = session.scalars(Resource.by_label(resource_label)).one_or_none()
|
||||||
"recipe",
|
if not resource:
|
||||||
recipe_idx,
|
wiki_url = normalize_url(
|
||||||
"ingredient",
|
href=html_elem.find_element(By.TAG_NAME, "a").get_attribute("href"),
|
||||||
ingredient_idx,
|
)
|
||||||
"name:",
|
resource = Resource(label=resource_label, wiki_url=wiki_url)
|
||||||
resource_flow.resource.label,
|
new_resources.append(resource)
|
||||||
)
|
resources[resource_label] = resource
|
||||||
print(
|
amount = html_elem.find_element(By.CSS_SELECTOR, ".text-xs:nth-child(2)").text.strip()
|
||||||
"recipe",
|
time = html_elem.find_element(By.CSS_SELECTOR, ".text-xs:nth-child(3)").text.strip()
|
||||||
recipe_idx,
|
return ResourceFlow(resource=resource, amount=amount, time=time)
|
||||||
"ingredient",
|
|
||||||
ingredient_idx,
|
|
||||||
"count:",
|
|
||||||
resource_flow.amount,
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"recipe",
|
|
||||||
recipe_idx,
|
|
||||||
"ingredient",
|
|
||||||
ingredient_idx,
|
|
||||||
"time:",
|
|
||||||
resource_flow.time,
|
|
||||||
)
|
|
||||||
result_html_elems = recipe_html_elem.find_elements(
|
|
||||||
By.CSS_SELECTOR, f".flex-row > div:nth-child(3) > div:has(> a)"
|
|
||||||
)
|
|
||||||
results: list[ResourceFlow] = []
|
|
||||||
for result_idx in range(len(result_html_elems)):
|
|
||||||
resource_flow = extract_resource_flow(result_html_elems[result_idx])
|
|
||||||
results.append(resource_flow)
|
|
||||||
print(
|
|
||||||
"recipe",
|
|
||||||
recipe_idx,
|
|
||||||
"result",
|
|
||||||
result_idx,
|
|
||||||
"name:",
|
|
||||||
resource_flow.resource.label,
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"recipe",
|
|
||||||
recipe_idx,
|
|
||||||
"result",
|
|
||||||
result_idx,
|
|
||||||
"count:",
|
|
||||||
resource_flow.amount,
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"recipe",
|
|
||||||
recipe_idx,
|
|
||||||
"result",
|
|
||||||
result_idx,
|
|
||||||
"time:",
|
|
||||||
resource_flow.time,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
ingredient_html_elems = recipe_html_elem.find_elements(
|
||||||
|
By.CSS_SELECTOR, f".flex-row > div:nth-child(1) > div:has(> a)"
|
||||||
|
)
|
||||||
|
ingredients: list[ResourceFlow] = []
|
||||||
|
for ingredient_idx in range(len(ingredient_html_elems)):
|
||||||
|
resource_flow = extract_resource_flow(ingredient_html_elems[ingredient_idx])
|
||||||
|
ingredients.append(resource_flow)
|
||||||
|
result_html_elems = recipe_html_elem.find_elements(
|
||||||
|
By.CSS_SELECTOR, f".flex-row > div:nth-child(3) > div:has(> a)"
|
||||||
|
)
|
||||||
|
results: list[ResourceFlow] = []
|
||||||
|
for result_idx in range(len(result_html_elems)):
|
||||||
|
resource_flow = extract_resource_flow(result_html_elems[result_idx])
|
||||||
|
results.append(resource_flow)
|
||||||
|
|
||||||
|
with session.no_autoflush:
|
||||||
|
# re-use existing Factory or create new
|
||||||
factory = session.scalars(Factory.by_label(factory_label)).one_or_none()
|
factory = session.scalars(Factory.by_label(factory_label)).one_or_none()
|
||||||
if not factory:
|
if not factory:
|
||||||
factory = Factory(label=factory_label, wiki_url=factory_url)
|
factory = Factory(label=factory_label, wiki_url=factory_url)
|
||||||
session.add(factory)
|
session.add(factory)
|
||||||
for flow in ingredients + results:
|
|
||||||
res = session.scalars(Resource.by_label(flow.resource.label)).one_or_none()
|
|
||||||
if res:
|
|
||||||
flow.resource = res
|
|
||||||
else:
|
|
||||||
session.add(flow.resource)
|
|
||||||
session.add(flow)
|
|
||||||
recipe = Recipe(factory=factory, ingredients=ingredients, results=results)
|
|
||||||
session.add(recipe)
|
|
||||||
session.commit()
|
|
||||||
|
|
||||||
updated_resource = session.scalars(Resource.by_label(input_resource_label)).one()
|
session.add_all(new_resources)
|
||||||
updated_resource.recipes_populated_at = datetime.datetime.utcnow()
|
session.add_all(ingredients)
|
||||||
res_id = updated_resource.id
|
session.add_all(results)
|
||||||
session.commit()
|
session.add(Recipe(factory=factory, ingredients=ingredients, results=results))
|
||||||
return res_id
|
session.flush()
|
||||||
|
|
||||||
|
updated_resource = session.scalars(Resource.by_label(input_resource_label)).one()
|
||||||
|
updated_resource.recipes_populated_at = datetime.datetime.utcnow()
|
||||||
|
session.flush()
|
||||||
|
return updated_resource
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
|
@ -221,105 +198,109 @@ def populate_recipes(browser: WebDriver, engine: sqlalchemy.Engine, input_resour
|
||||||
@click.option("--refetch", is_flag=True)
|
@click.option("--refetch", is_flag=True)
|
||||||
@click.argument("search")
|
@click.argument("search")
|
||||||
def main(result: bool, debug: bool, refetch: bool, search: str):
|
def main(result: bool, debug: bool, refetch: bool, search: str):
|
||||||
|
global __debug
|
||||||
|
__debug = debug
|
||||||
engine = create_engine("sqlite:///file.db", echo=debug)
|
engine = create_engine("sqlite:///file.db", echo=debug)
|
||||||
Base.metadata.create_all(bind=engine)
|
Base.metadata.create_all(bind=engine)
|
||||||
if result and search:
|
if result and search:
|
||||||
wiki_search = True
|
wiki_search = True
|
||||||
resource_label = search
|
resource: Optional[Resource] = None
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
resources = session.scalars(Resource.by_label(resource_label)).all()
|
matching_resources = session.scalars(Resource.by_label(search)).all()
|
||||||
if len(resources) == 0:
|
if len(matching_resources) == 0:
|
||||||
print("Could not find existing resources matching the search string.. starting wiki search")
|
print("Could not find existing resources matching the search string.. starting wiki search")
|
||||||
else:
|
else:
|
||||||
for idx in range(1, len(resources) + 1):
|
for idx in range(1, len(matching_resources) + 1):
|
||||||
print(f"{idx}: {resources[idx - 1].label}")
|
print(f"{idx}: {matching_resources[idx - 1].label}")
|
||||||
user_choice = click.prompt(
|
user_choice = click.prompt(
|
||||||
"Chose a resource to continue or 0 to continue with a wiki search", default=1
|
"Chose a resource to continue or 0 to continue with a wiki search", default=1
|
||||||
)
|
)
|
||||||
if user_choice != 0:
|
if user_choice != 0:
|
||||||
res_id = resources[user_choice - 1].id
|
resource = matching_resources[user_choice - 1]
|
||||||
resource_label = resources[user_choice - 1].label
|
|
||||||
wiki_search = False
|
wiki_search = False
|
||||||
session.commit()
|
|
||||||
|
|
||||||
firefox_options = Options()
|
try:
|
||||||
firefox_options.add_argument("--width=1600")
|
if wiki_search:
|
||||||
firefox_options.add_argument("--height=1015")
|
browser = get_browser()
|
||||||
if not debug:
|
browser.get("https://wiki.kyrium.space/")
|
||||||
firefox_options.add_argument("--headless")
|
search_bar = browser.find_element(By.CSS_SELECTOR, "nav input[placeholder='Search for an item...']")
|
||||||
browser = Firefox(options=firefox_options)
|
search_bar.click()
|
||||||
browser.implicitly_wait(5)
|
search_bar.send_keys(search)
|
||||||
try:
|
search_button = browser.find_element(By.CSS_SELECTOR, "nav button[type='submit']")
|
||||||
if wiki_search:
|
search_button.click()
|
||||||
browser.get("https://wiki.kyrium.space/")
|
choices = browser.find_elements(
|
||||||
search_bar = browser.find_element(By.CSS_SELECTOR, "nav input[placeholder='Search for an item...']")
|
By.CSS_SELECTOR, "body > div > .container:nth-child(1) a.items-center"
|
||||||
search_bar.click()
|
|
||||||
search_bar.send_keys(search)
|
|
||||||
search_button = browser.find_element(By.CSS_SELECTOR, "nav button[type='submit']")
|
|
||||||
search_button.click()
|
|
||||||
choices = browser.find_elements(By.CSS_SELECTOR, "body > div > .container:nth-child(1) a.items-center")
|
|
||||||
if not choices:
|
|
||||||
print("No wiki entries found for this result")
|
|
||||||
return
|
|
||||||
elif len(choices) > 1:
|
|
||||||
default_choice = 1
|
|
||||||
choice_names: list[str] = []
|
|
||||||
for choice_idx in range(1, len(choices) + 1):
|
|
||||||
recipe_choice = choices[choice_idx - 1]
|
|
||||||
name = recipe_choice.find_element(By.TAG_NAME, "img").get_attribute("alt")
|
|
||||||
choice_names.append(name)
|
|
||||||
if name.casefold() == search.casefold():
|
|
||||||
default_choice = choice_idx
|
|
||||||
print(f"{choice_idx}: {name}")
|
|
||||||
user_choice = click.prompt("Chose a recipe to continue…", default=default_choice)
|
|
||||||
if not user_choice:
|
|
||||||
user_choice = default_choice
|
|
||||||
else:
|
|
||||||
user_choice = int(user_choice)
|
|
||||||
|
|
||||||
link_html_elem = choices[user_choice - 1]
|
|
||||||
else:
|
|
||||||
link_html_elem = choices[0]
|
|
||||||
|
|
||||||
resource_label = link_html_elem.find_element(By.TAG_NAME, "img").get_attribute("alt")
|
|
||||||
# FIXME: check if resource_label is in database
|
|
||||||
if debug:
|
|
||||||
print("resource_label:", resource_label)
|
|
||||||
link_html_elem.click()
|
|
||||||
res_id = populate_recipes(browser=browser, engine=engine, input_resource_label=resource_label)
|
|
||||||
else:
|
|
||||||
with Session(engine) as session:
|
|
||||||
input_resource = session.get(Resource, res_id)
|
|
||||||
input_resource_url = input_resource.wiki_url
|
|
||||||
resource_label = input_resource.label
|
|
||||||
refetch = (
|
|
||||||
refetch
|
|
||||||
or input_resource.recipes_populated_at is None
|
|
||||||
or datetime.datetime.utcnow() - input_resource.recipes_populated_at > recipe_info_timeout
|
|
||||||
)
|
)
|
||||||
if refetch:
|
if not choices:
|
||||||
print("Deleting recipes for", input_resource)
|
print("No wiki entries found for this result")
|
||||||
for flow in session.scalars(select(ResourceFlow).where(ResourceFlow.resource_id == res_id)):
|
return
|
||||||
|
elif len(choices) > 1:
|
||||||
|
default_choice = 1
|
||||||
|
choice_names: list[str] = []
|
||||||
|
for choice_idx in range(1, len(choices) + 1):
|
||||||
|
recipe_choice = choices[choice_idx - 1]
|
||||||
|
name = recipe_choice.find_element(By.TAG_NAME, "img").get_attribute("alt")
|
||||||
|
choice_names.append(name)
|
||||||
|
if name.casefold() == search.casefold():
|
||||||
|
default_choice = choice_idx
|
||||||
|
print(f"{choice_idx}: {name}")
|
||||||
|
user_choice = click.prompt("Chose a recipe to continue…", default=default_choice)
|
||||||
|
if not user_choice:
|
||||||
|
user_choice = default_choice
|
||||||
|
else:
|
||||||
|
user_choice = int(user_choice)
|
||||||
|
|
||||||
|
link_html_elem = choices[user_choice - 1]
|
||||||
|
else:
|
||||||
|
link_html_elem = choices[0]
|
||||||
|
|
||||||
|
alt_resource_label = link_html_elem.find_element(By.TAG_NAME, "img").get_attribute("alt")
|
||||||
|
resource = session.scalars(Resource.by_label(alt_resource_label)).one_or_none()
|
||||||
|
if not resource:
|
||||||
|
resource_fetch_url = normalize_url(href=link_html_elem.get_attribute("href"))
|
||||||
|
|
||||||
|
refetch = (
|
||||||
|
refetch
|
||||||
|
or resource is None
|
||||||
|
or resource.recipes_populated_at is None
|
||||||
|
or datetime.datetime.utcnow() - resource.recipes_populated_at > __recipe_info_timeout
|
||||||
|
)
|
||||||
|
if refetch and resource is not None:
|
||||||
|
print("Deleting recipes for", resource.label)
|
||||||
|
with session.begin_nested():
|
||||||
|
for flow in session.scalars(
|
||||||
|
select(ResourceFlow).where(ResourceFlow.resource_id == resource.id)
|
||||||
|
):
|
||||||
if flow.result_of:
|
if flow.result_of:
|
||||||
for flow2 in flow.result_of.ingredients:
|
for flow2 in flow.result_of.ingredients:
|
||||||
session.delete(flow2)
|
session.delete(flow2)
|
||||||
for flow2 in flow.result_of.results:
|
for flow2 in flow.result_of.results:
|
||||||
session.delete(flow2)
|
session.delete(flow2)
|
||||||
session.delete(flow.result_of)
|
session.delete(flow.result_of)
|
||||||
session.commit()
|
|
||||||
if refetch:
|
|
||||||
print("Refetching recipes for", resource_label)
|
|
||||||
browser.get(input_resource_url)
|
|
||||||
res_id = populate_recipes(browser=browser, engine=engine, input_resource_label=resource_label)
|
|
||||||
|
|
||||||
with Session(engine) as session:
|
if refetch:
|
||||||
stmt = select(Recipe).join(Recipe.results).filter(ResourceFlow.resource_id == res_id)
|
browser = get_browser()
|
||||||
|
if resource is None:
|
||||||
|
print("Fetching recipes for new resource", alt_resource_label)
|
||||||
|
assert resource_fetch_url, "Resource wiki url not set"
|
||||||
|
browser.get(resource_fetch_url)
|
||||||
|
resource_label = alt_resource_label
|
||||||
|
else:
|
||||||
|
print("Refetching recipes for", resource.label)
|
||||||
|
browser.get(resource.wiki_url)
|
||||||
|
resource_label = resource.label
|
||||||
|
|
||||||
|
with session.begin_nested():
|
||||||
|
resource = populate_recipes(session=session, input_resource_label=resource_label)
|
||||||
|
session.refresh(resource)
|
||||||
|
|
||||||
|
assert resource, "Resource must be set at this point"
|
||||||
|
stmt = select(Recipe).join(Recipe.results).filter(ResourceFlow.resource_id == resource.id)
|
||||||
for recipe in session.scalars(stmt):
|
for recipe in session.scalars(stmt):
|
||||||
print(recipe)
|
print(recipe)
|
||||||
for flow in recipe.ingredients:
|
for flow in recipe.ingredients:
|
||||||
print("ingredient:", flow.resource, flow)
|
print("ingredient:", flow.resource, flow)
|
||||||
for flow in recipe.results:
|
for flow in recipe.results:
|
||||||
print("result: ", flow.resource, flow)
|
print("result: ", flow.resource, flow)
|
||||||
finally:
|
finally:
|
||||||
if not debug:
|
browser_cleanup()
|
||||||
browser.quit()
|
|
||||||
|
|
Loading…
Reference in a new issue