Jupyter Notebook Code Walkthrough Day One

Global imports. Establish the base URL for Allrecipes search queries.

In [118]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from fractions import Fraction

# URL prefix of the Allrecipes search page; the search term is appended
# directly after "q=" when a query URL is built.
BASE_URL = "https://www.allrecipes.com/search?q="

Use an Allrecipes search query to get a list of recipe URLs.

In [84]:
def get_recipe_list(search_term, timeout=30):
  """Collect recipe links from the Allrecipes search results for a term.

  Requests ten result pages (offsets 0, 24, ..., 216) and gathers the
  ``href`` of every anchor matching ``a.mntl-card-list-card--extendable``.

  Args:
    search_term: Text to search for. It is URL-encoded before being appended
      to ``BASE_URL`` so spaces, ``&`` and ``#`` cannot corrupt the query.
    timeout: Seconds to wait for each HTTP response before giving up.

  Returns:
    A list of link strings in the order they were found. Cards without an
    ``href`` are skipped, so every entry can be passed to ``requests.get``.

  Raises:
    requests.RequestException: If a request fails at the network level or
      exceeds ``timeout``.
  """
  # Imported locally so the function does not depend on the notebook's
  # import cell having been extended.
  from urllib.parse import quote_plus

  links = []
  query = quote_plus(str(search_term))

  for offset in range(0, 240, 24):
    url = f"{BASE_URL}{query}&offset={offset}"

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
      print(f"Failed to fetch the page. Status code: {response.status_code}")
      continue

    soup = BeautifulSoup(response.text, "html.parser")
    for card in soup.select("a.mntl-card-list-card--extendable"):
      href = card.get("href")
      # A placeholder string here would later be handed to requests.get and
      # raise, so cards without a usable link are dropped instead.
      if href:
        links.append(href)

  return links

Iterate through the list of URLs returned by the function above and download each web page into a temporary folder.

In [107]:
def download_recipes(links, dir_title, timeout=30):
  """Download each link and save it as a numbered HTML file.

  Files are named after the link's position in ``links``, zero-padded to
  three digits (``000.html``, ``001.html``, ...). Links that fail are
  reported and skipped, so their numbers are simply absent from the folder.

  Args:
    links: Iterable of URL strings to fetch.
    dir_title: Directory to write into; created if it does not exist.
    timeout: Seconds to wait for each HTTP response before giving up.
  """
  os.makedirs(dir_title, exist_ok=True)

  for index, link in enumerate(links):
    try:
      response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
      # A malformed URL, connection error, or timeout on one link should not
      # abort the remaining downloads.
      print(f"Failed to fetch {link!r}: {exc}")
      continue

    if response.status_code != 200:
      print(f"Failed to fetch the page. Status code: {response.status_code}")
      continue

    file_path = os.path.join(dir_title, f"{index:03d}.html")
    with open(file_path, 'w', encoding="utf-8") as f:
      f.write(response.text)

Utility function for converting quantity strings, including Unicode fraction characters, to decimal numbers.

In [143]:
def unicode_to_float(input_str):
  """Convert a quantity string to a float, understanding Unicode fractions.

  Handles plain numbers ("2"), ASCII fractions ("3/4"), single vulgar
  fraction characters ("½") and mixed numbers written with or without
  a separator ("1 ½", "1½", "1 1/2").

  Args:
    input_str: The quantity text.

  Returns:
    The value as a float, or the empty string unchanged when ``input_str``
    is empty. A two-part string whose parts cannot be parsed as
    ``<integer> <fraction>`` yields 0.0.

  Raises:
    ValueError: If the text is not two whitespace-separated parts and cannot
      be parsed as a number or fraction.
  """
  if input_str == '':
    return ''

  vulgar_fractions = {
    '½': '1/2', '⅓': '1/3', '⅔': '2/3', '¼': '1/4', '¾': '3/4',
    '⅕': '1/5', '⅖': '2/5', '⅗': '3/5', '⅘': '4/5', '⅙': '1/6',
    '⅚': '5/6', '⅐': '1/7', '⅛': '1/8', '⅜': '3/8', '⅝': '5/8',
    '⅞': '7/8', '⅑': '1/9', '⅒': '1/10',
  }

  pieces = []
  previous = ''
  for char in input_str:
    if char in vulgar_fractions:
      # "1½" must read as 1 + 1/2; expanding the glyph in place would give
      # "11/2" (five and a half), so separate it from a preceding digit.
      if '0' <= previous <= '9':
        pieces.append(' ')
      pieces.append(vulgar_fractions[char])
    else:
      pieces.append(char)
    previous = char
  text = ''.join(pieces)

  # split() with no argument also copes with non-breaking or repeated spaces
  # between the whole part and the fraction.
  parts = text.split()
  if len(parts) == 2:
    try:
      whole = int(parts[0])
      frac = Fraction(parts[1])
    except ValueError:
      whole = 0
      frac = 0
    text = str(whole + frac)
  return float(Fraction(text))

Parse the downloaded pages and save a CSV file containing each ingredient's name, unit, and quantity.

In [131]:
def _ingredient_span_text(item, attribute):
  """Return the stripped text of the span flagged ``attribute='true'`` inside ``item``, or None if absent."""
  span = item.select_one(f"span[{attribute}='true']")
  return span.text.strip() if span is not None else None


def ingredient_list(directory, output_path="ingredients.csv"):
  """Extract ingredients from saved recipe pages and write them to a CSV.

  Every ``.html`` file in ``directory`` is parsed, in sorted filename order
  so the output is reproducible. Each
  ``li.mm-recipes-structured-ingredients__list-item`` element contributes one
  row with the columns ``ingredient_name``, ``ingredient_unit`` and
  ``ingredient_quantity``.

  Missing spans are recorded with the placeholders "No ingredient name",
  "No ingredient unit" and "No ingredient quantity". A quantity that cannot
  be converted to a number is stored as its raw text rather than aborting
  the whole run.

  Args:
    directory: Folder containing the downloaded ``.html`` files.
    output_path: Destination CSV file; defaults to "ingredients.csv" in the
      current working directory.
  """
  rows = []
  for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".html"):
      continue

    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
      soup = BeautifulSoup(f.read(), "html.parser")

    for item in soup.select("li.mm-recipes-structured-ingredients__list-item"):
      name = _ingredient_span_text(item, "data-ingredient-name")
      unit = _ingredient_span_text(item, "data-ingredient-unit")
      quantity_text = _ingredient_span_text(item, "data-ingredient-quantity")

      if quantity_text is None:
        quantity = "No ingredient quantity"
      else:
        try:
          quantity = unicode_to_float(quantity_text)
        except (ValueError, ZeroDivisionError):
          # Keep the original text so one odd quantity does not lose the row.
          quantity = quantity_text

      rows.append([
        name if name is not None else "No ingredient name",
        unit if unit is not None else "No ingredient unit",
        quantity,
      ])

  # Build the frame once; appending with df.loc[len(df)] reallocates per row.
  df = pd.DataFrame(rows, columns=["ingredient_name", "ingredient_unit", "ingredient_quantity"])
  df.to_csv(output_path, index=False)

The "main" function. Combines all of the above functions into a single pipeline.

In [108]:
def overarching(term):
  """Run the full scrape for one search term.

  Looks up recipe links for ``term``, downloads them into the folder
  ``tmp_<term>``, then builds the ingredients CSV from that folder. A
  progress message is printed after each stage.
  """
  download_dir = f"tmp_{term}"

  recipe_links = get_recipe_list(term)
  print("Recipe list retrieved")

  download_recipes(recipe_links, download_dir)
  print("Recipes downloaded")

  ingredient_list(download_dir)
  print("Ingredients list created")
In [120]:
# Run the whole pipeline for the search term "ramen".
overarching("ramen")