feat: Open AI Recipe Scraper ()

Michael Genson 2024-06-07 06:45:50 -05:00 committed by GitHub
parent a49c32e663
commit 4afb767375
14 changed files with 264 additions and 55 deletions
frontend
lang/messages
lib/api/user/recipes
pages/g/_groupSlug/r/create
mealie
tests/unit_tests

@ -583,6 +583,8 @@
"report-deletion-failed": "Report deletion failed",
"recipe-debugger": "Recipe Debugger",
"recipe-debugger-description": "Grab the URL of the recipe you want to debug and paste it here. The URL will be scraped by the recipe scraper and the results will be displayed. If you don't see any data returned, the site you are trying to scrape is not supported by Mealie or its scraper library.",
"use-openai": "Use OpenAI",
"recipe-debugger-use-openai-description": "Use OpenAI to parse the results instead of relying on the scraper library. When creating a recipe via URL, this is done automatically if the scraper library fails, but you may test it manually here.",
"debug": "Debug",
"tree-view": "Tree View",
"recipe-yield": "Recipe Yield",

@ -128,8 +128,8 @@ export class RecipeAPI extends BaseCRUDAPI<CreateRecipe, Recipe, Recipe> {
return this.requests.post<UpdateImageResponse>(routes.recipesRecipeSlugImage(slug), { url });
}
async testCreateOneUrl(url: string) {
return await this.requests.post<Recipe | null>(routes.recipesTestScrapeUrl, { url });
async testCreateOneUrl(url: string, useOpenAI = false) {
return await this.requests.post<Recipe | null>(routes.recipesTestScrapeUrl, { url, useOpenAI });
}
async createOneByUrl(url: string, includeTags: boolean) {

@ -18,7 +18,11 @@
:rules="[validators.url]"
:hint="$t('new-recipe.url-form-hint')"
persistent-hint
></v-text-field>
/>
</v-card-text>
<v-card-text v-if="appInfo && appInfo.enableOpenai">
{{ $t('recipe.recipe-debugger-use-openai-description') }}
<v-checkbox v-model="useOpenAI" :label="$t('recipe.use-openai')"></v-checkbox>
</v-card-text>
<v-card-actions class="justify-center">
<div style="width: 250px">
@ -51,7 +55,7 @@
<script lang="ts">
import { defineComponent, reactive, toRefs, ref, useRouter, computed, useRoute } from "@nuxtjs/composition-api";
import { useUserApi } from "~/composables/api";
import { useAppInfo, useUserApi } from "~/composables/api";
import { validators } from "~/composables/use-validators";
import { Recipe } from "~/lib/api/types/recipe";
@ -60,11 +64,13 @@ export default defineComponent({
const state = reactive({
error: false,
loading: false,
useOpenAI: false,
});
const api = useUserApi();
const route = useRoute();
const router = useRouter();
const appInfo = useAppInfo();
const recipeUrl = computed({
set(recipe_import_url: string | null) {
@ -89,13 +95,14 @@ export default defineComponent({
state.loading = true;
const { data } = await api.recipes.testCreateOneUrl(url);
const { data } = await api.recipes.testCreateOneUrl(url, state.useOpenAI);
state.loading = false;
debugData.value = data;
}
return {
appInfo,
recipeUrl,
debugTreeView,
debugUrl,

@ -5,7 +5,17 @@ from zipfile import ZipFile
import orjson
import sqlalchemy
from fastapi import BackgroundTasks, Depends, File, Form, HTTPException, Path, Query, Request, status
from fastapi import (
BackgroundTasks,
Depends,
File,
Form,
HTTPException,
Path,
Query,
Request,
status,
)
from fastapi.datastructures import UploadFile
from fastapi.responses import JSONResponse
from pydantic import UUID4, BaseModel, Field
@ -14,7 +24,11 @@ from starlette.background import BackgroundTask
from starlette.responses import FileResponse
from mealie.core import exceptions
from mealie.core.dependencies import get_temporary_path, get_temporary_zip_path, validate_recipe_token
from mealie.core.dependencies import (
get_temporary_path,
get_temporary_zip_path,
validate_recipe_token,
)
from mealie.core.security import create_recipe_slug_token
from mealie.db.models.group.cookbook import CookBook
from mealie.pkgs import cache
@ -26,10 +40,19 @@ from mealie.routes._base.routers import MealieCrudRoute, UserAPIRouter
from mealie.schema.cookbook.cookbook import ReadCookBook
from mealie.schema.make_dependable import make_dependable
from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe
from mealie.schema.recipe.recipe import CreateRecipe, CreateRecipeByUrlBulk, RecipeLastMade, RecipeSummary
from mealie.schema.recipe.recipe import (
CreateRecipe,
CreateRecipeByUrlBulk,
RecipeLastMade,
RecipeSummary,
)
from mealie.schema.recipe.recipe_asset import RecipeAsset
from mealie.schema.recipe.recipe_scraper import ScrapeRecipeTest
from mealie.schema.recipe.request_helpers import RecipeDuplicate, RecipeZipTokenResponse, UpdateImageResponse
from mealie.schema.recipe.request_helpers import (
RecipeDuplicate,
RecipeZipTokenResponse,
UpdateImageResponse,
)
from mealie.schema.response import PaginationBase, PaginationQuery
from mealie.schema.response.pagination import RecipeSearchQuery
from mealie.schema.response.responses import ErrorResponse
@ -40,13 +63,21 @@ from mealie.services.event_bus_service.event_types import (
EventRecipeData,
EventTypes,
)
from mealie.services.recipe.recipe_data_service import InvalidDomainError, NotAnImageError, RecipeDataService
from mealie.services.recipe.recipe_data_service import (
InvalidDomainError,
NotAnImageError,
RecipeDataService,
)
from mealie.services.recipe.recipe_service import RecipeService
from mealie.services.recipe.template_service import TemplateService
from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
from mealie.services.scraper.scraped_extras import ScraperContext
from mealie.services.scraper.scraper import create_from_url
from mealie.services.scraper.scraper_strategies import ForceTimeoutException, RecipeScraperPackage
from mealie.services.scraper.scraper_strategies import (
ForceTimeoutException,
RecipeScraperOpenAI,
RecipeScraperPackage,
)
class JSONBytes(JSONResponse):
@ -210,10 +241,11 @@ class RecipeController(BaseRecipeController):
return {"reportId": report_id}
@router.post("/test-scrape-url")
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
async def test_parse_recipe_url(self, data: ScrapeRecipeTest):
# Debugger should produce the same result as the scraper sees before cleaning
ScraperClass = RecipeScraperOpenAI if data.use_openai else RecipeScraperPackage
try:
if scraped_data := await RecipeScraperPackage(url.url, self.translator).scrape_url():
if scraped_data := await ScraperClass(data.url, self.translator).scrape_url():
return scraped_data.schema.data
except ForceTimeoutException as e:
raise HTTPException(
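For illustration only (not part of this commit): a hedged example of calling the debug endpoint from a script. The host, port, bearer token, and /api prefix below are placeholders, not taken from the commit; the handler above simply swaps in RecipeScraperOpenAI when useOpenAI is true and otherwise uses the plain RecipeScraperPackage.

import httpx

# Placeholder host/port and token -- adjust for your deployment (assumption, not from the commit).
resp = httpx.post(
    "http://localhost:9000/api/recipes/test-scrape-url",
    headers={"Authorization": "Bearer <token>"},
    json={"url": "https://example.com/recipe", "useOpenAI": True},
)
print(resp.json())  # raw scraped schema data, before Mealie's cleaning step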

@ -0,0 +1,10 @@
from pydantic import BaseModel
class OpenAIBase(BaseModel):
"""
This class defines the JSON schema sent to OpenAI. Its schema is
injected directly into the OpenAI prompt.
"""
__doc__ = "" # we don't want to include the docstring in the JSON schema

@ -0,0 +1,92 @@
from textwrap import dedent
from pydantic import Field, field_validator
from ._base import OpenAIBase
class OpenAIIngredient(OpenAIBase):
input: str = Field(
...,
description=dedent(
"""
The input is simply the ingredient string you are processing as-is. It is forbidden to
modify this at all, you must provide the input exactly as you received it.
"""
),
)
confidence: float | None = Field(
None,
description=dedent(
"""
This value is a float between 0 - 100, where 100 is full confidence that the result is correct,
and 0 is no confidence that the result is correct. If you're unable to parse anything,
and you put the entire string in the notes, you should return 0 confidence. If you can easily
parse the string into each component, then you should return a confidence of 100. If you have to
guess which part is the unit and which part is the food, your confidence should be lower, such as 60.
Even if there is no unit or note, if you're able to determine the food, you may use a higher confidence.
If the entire ingredient consists of only a food, you can use a confidence of 100.
"""
),
)
quantity: float | None = Field(
0,
description=dedent(
"""
The numerical representation of how much of this ingredient. For instance, if you receive
"3 1/2 grams of minced garlic", the quantity is "3 1/2". Quantity may be represented as a whole number
(integer), a float or decimal, or a fraction. You should output quantity in only whole numbers or
floats, converting fractions into floats. Floats longer than 10 decimal places should be
rounded to 10 decimal places.
"""
),
)
unit: str | None = Field(
None,
description=dedent(
"""
The unit of measurement for this ingredient. For instance, if you receive
"2 lbs chicken breast", the unit is "lbs" (short for "pounds").
"""
),
)
food: str | None = Field(
None,
description=dedent(
"""
The actual physical ingredient used in the recipe. For instance, if you receive
"3 cups of onions, chopped", the food is "onions".
"""
),
)
note: str | None = Field(
None,
description=dedent(
"""
The rest of the text that represents more detail on how to prepare the ingredient.
Anything that is not one of the above should be the note. For instance, if you receive
"one can of butter beans, drained" the note would be "drained". If you receive
"3 cloves of garlic peeled and finely chopped", the note would be "peeled and finely chopped".
"""
),
)
@field_validator("quantity")
def coerce_none_qty(cls, v: float | None) -> float:
return v or 0
@field_validator("confidence")
def validate_confidence(cls, v: float | None) -> float:
v = v or 0
if v < 0:
v = 0
elif v > 100:
v = 100
return v / 100
class OpenAIIngredients(OpenAIBase):
ingredients: list[OpenAIIngredient] = []
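A short illustrative sketch (not from the commit) of how the two validators above behave: the field description asks the model for confidence on a 0-100 scale, and validate_confidence clamps it and rescales it to 0-1, while coerce_none_qty turns a missing quantity into 0.

ing = OpenAIIngredient(
    input="2 lbs chicken breast",
    confidence=85,   # model replies on the 0-100 scale described above
    quantity=None,   # model omitted the quantity
    unit="lbs",
    food="chicken breast",
)
assert ing.confidence == 0.85  # clamped and rescaled to 0-1 by validate_confidence
assert ing.quantity == 0       # None coerced to 0 by coerce_none_qty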

@ -1,10 +1,11 @@
from pydantic import ConfigDict
from pydantic import ConfigDict, Field
from mealie.schema._mealie.mealie_model import MealieModel
class ScrapeRecipeTest(MealieModel):
url: str
use_openai: bool = Field(False, alias="useOpenAI")
class ScrapeRecipe(MealieModel):
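Editor's note, for illustration only: the camelCase alias is what lets the frontend's { url, useOpenAI } payload (see the API client change above) populate the snake_case field, assuming pydantic v2's default behavior of validating by alias.

payload = {"url": "https://example.com/recipe", "useOpenAI": True}
data = ScrapeRecipeTest.model_validate(payload)
assert data.use_openai is True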

@ -1,11 +1,4 @@
You are a bot that parses user input into recipe ingredients. You will receive a list of one or more ingredients, each containing one or more of the following components:
- Food: the actual physical ingredient used in the recipe. For instance, if you receive "3 cups of onions, chopped", the food is "onions"
- Unit: the unit of measurement for this ingredient. For instance, if you receive "2 lbs chicken breast", the unit is "lbs" (short for "pounds")
- Quantity: the numerical representation of how much of this ingredient. For instance, if you receive "3 1/2 grams of minced garlic", the quantity is "3 1/2". Quantity may be represented as a whole number (integer), a float or decimal, or a fraction. You should output quantity in only whole numbers or floats, converting fractions into floats. Floats longer than 10 decimal places should be rounded to 10 decimal places.
- Note: the rest of the text that represents more detail on how to prepare the ingredient. Anything that is not one of the above should be the note. For instance, if you receive "one can of butter beans, drained" the note would be "drained". If you receive "3 cloves of garlic peeled and finely chopped", the note would be "peeled and finely chopped"
- Input: The input is simply the ingredient string you are processing as-is. It is forbidden to modify this at all, you must provide the input exactly as you received it
While parsing the ingredients, there are some things to keep in mind:
You are a bot that parses user input into recipe ingredients. You will receive a list of one or more ingredients, each containing one or more of the following components: quantity, unit, food, and note. Their definitions are stated in the JSON schema below. While parsing the ingredients, there are some things to keep in mind:
- If you cannot accurately determine the quantity, unit, food, or note, you should place everything into the note field and leave everything else empty. It's better to err on the side of putting everything in the note field than being wrong
- You may receive recipe ingredients from multiple different languages. You should adhere to the grammar rules of the input language when trying to parse the ingredient string
- Sometimes foods or units will be in their singular, plural, or other grammatical forms. You must interpret all of them appropriately
@ -17,8 +10,6 @@ While parsing the ingredients, there are some things to keep in mind:
It is imperative that you do not create any data or otherwise make up any information. Failure to adhere to this rule is illegal and will result in harsh punishment. If you are unsure, place the entire string into the note section of the response. Do not make things up.
In addition to calculating the recipe ingredient fields, you are also responsible for including a confidence value. This value is a float between 0 - 1, where 1 is full confidence that the result is correct, and 0 is no confidence that the result is correct. If you're unable to parse anything, and you put the entire string in the notes, you should return 0 confidence. If you can easily parse the string into each component, then you should return a confidence of 1. If you have to guess which part is the unit and which part is the food, your confidence should be lower, such as 0.6. Even if there is no unit or note, if you're able to determine the food, you may use a higher confidence. If the entire ingredient consists of only a food, you can use a confidence of 1.
Below you will receive the JSON schema for your response. Your response must be in valid JSON in the below schema as provided. You must respond in this JSON schema; failure to do so is illegal. It is imperative that you follow the schema precisely to avoid punishment. You must follow the JSON schema.
The user message that you receive will be the list of one or more recipe ingredients for you to parse. Your response should have exactly one item for each item provided. For instance, if you receive 12 items to parse, then your response should be an array of 12 parsed items.
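For illustration (not part of the commit), a reply that conforms to the contract above -- exactly one output item per input ingredient, with confidence on the 0-100 scale from the field descriptions -- validated against the OpenAIIngredients model defined earlier; the JSON content itself is made up.

raw_response = """
{
  "ingredients": [
    {
      "input": "3 cups of onions, chopped",
      "confidence": 100,
      "quantity": 3,
      "unit": "cups",
      "food": "onions",
      "note": "chopped"
    }
  ]
}
"""
parsed = OpenAIIngredients.model_validate_json(raw_response)
assert parsed.ingredients[0].food == "onions"
assert parsed.ingredients[0].confidence == 1.0  # rescaled from the 0-100 scale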

@ -0,0 +1,7 @@
You are a bot that reads website data and parses it into recipe JSON. You will receive the contents of a webpage (such as its HTML) and you need to extract the recipe data and return its JSON in valid schema. The recipe schema is the standard schema.org schema, which is defined at "https://schema.org/Recipe".
It is imperative that you do not create any data or otherwise make up any information. Failure to adhere to this rule is illegal and will result in harsh punishment. If you are unable to extract data due to insufficient input, you may reply with a completely empty JSON object (represented by two brackets: {}).
Your response must be in valid JSON in the schema.org Recipe definition. You must respond in this JSON schema; failure to do so is illegal. It is imperative that you follow the schema precisely to avoid punishment. You must follow the JSON schema.
The user message that you receive will be the webpage contents, including (but not necessarily limited to) text extracted from the HTML.
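For illustration only (not from the commit): the kind of minimal schema.org Recipe payload the prompt asks the model for. RecipeScraperOpenAI (further below) wraps whatever JSON comes back in an application/ld+json script tag so the regular scraper-library path can parse it like any other page; the recipe content here is invented.

import json

openai_reply = {
    "@context": "https://schema.org",
    "@type": "Recipe",
    "name": "Simple Tomato Soup",
    "recipeYield": "4 servings",
    "recipeIngredient": ["6 tomatoes, diced", "1 onion", "2 cups vegetable stock"],
    "recipeInstructions": [{"@type": "HowToStep", "text": "Simmer everything for 30 minutes."}],
    "image": "https://example.com/soup.jpg",
}

# Mirrors what RecipeScraperOpenAI.get_html() does with the model's response.
html = (
    "<!DOCTYPE html><html><head>"
    f'<script type="application/ld+json">{json.dumps(openai_reply)}</script>'
    "</head><body></body></html>"
)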

@ -2,8 +2,7 @@ import asyncio
import json
from collections.abc import Awaitable
from pydantic import BaseModel
from mealie.schema.openai.recipe_ingredient import OpenAIIngredient, OpenAIIngredients
from mealie.schema.recipe.recipe_ingredient import (
CreateIngredientFood,
CreateIngredientUnit,
@ -16,27 +15,6 @@ from mealie.services.openai import OpenAIDataInjection, OpenAIService
from .._base import ABCIngredientParser
class OpenAIIngredient(BaseModel):
"""
This class defines the JSON schema sent to OpenAI. Its schema is
injected directly into the OpenAI prompt.
"""
__doc__ = "" # we don't want to include the docstring in the JSON schema
input: str
confidence: float | None = None
quantity: float | None = 0
unit: str | None = None
food: str | None = None
note: str | None = None
class OpenAIIngredients(BaseModel):
ingredients: list[OpenAIIngredient] = []
class OpenAIParser(ABCIngredientParser):
def _convert_ingredient(self, openai_ing: OpenAIIngredient) -> ParsedIngredient:
ingredient = RecipeIngredient(

@ -2,9 +2,19 @@ from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper.scraped_extras import ScrapedExtras
from .scraper_strategies import ABCScraperStrategy, RecipeScraperOpenGraph, RecipeScraperPackage
from .scraper_strategies import (
ABCScraperStrategy,
RecipeScraperOpenAI,
RecipeScraperOpenGraph,
RecipeScraperPackage,
safe_scrape_html,
)
DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [RecipeScraperPackage, RecipeScraperOpenGraph]
DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [
RecipeScraperPackage,
RecipeScraperOpenAI,
RecipeScraperOpenGraph,
]
class RecipeScraper:
@ -27,8 +37,9 @@ class RecipeScraper:
Scrapes a recipe from the web.
"""
raw_html = await safe_scrape_html(url)
for scraper_type in self.scrapers:
scraper = scraper_type(url, self.translator)
scraper = scraper_type(url, self.translator, raw_html=raw_html)
result = await scraper.parse()
if result is not None:
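Condensed illustration (editor's sketch with simplified names, not the real Mealie classes) of the pattern the change above enables: the page is downloaded once and shared, and the first strategy to return a result wins, with RecipeScraperOpenAI sitting between the scraper library and the OpenGraph fallback.

async def first_successful(url, strategies, translator):
    raw_html = await safe_scrape_html(url)  # fetched once, shared by every strategy
    for strategy_cls in strategies:
        scraper = strategy_cls(url, translator, raw_html=raw_html)
        result = await scraper.parse()
        if result is not None:  # first strategy that succeeds wins
            return result
    return None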

@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
from collections.abc import Callable
from typing import Any
import bs4
import extruct
from fastapi import HTTPException, status
from httpx import AsyncClient
@ -10,10 +11,12 @@ from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrap
from slugify import slugify
from w3lib.html import get_base_url
from mealie.core.config import get_app_settings
from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.pkgs import safehttp
from mealie.schema.recipe.recipe import Recipe, RecipeStep
from mealie.services.openai import OpenAIService
from mealie.services.scraper.scraped_extras import ScrapedExtras
from . import cleaner
@ -86,9 +89,15 @@ class ABCScraperStrategy(ABC):
url: str
def __init__(self, url: str, translator: Translator) -> None:
def __init__(
self,
url: str,
translator: Translator,
raw_html: str | None = None,
) -> None:
self.logger = get_logger()
self.url = url
self.raw_html = raw_html
self.translator = translator
@abstractmethod
@ -109,7 +118,7 @@ class ABCScraperStrategy(ABC):
class RecipeScraperPackage(ABCScraperStrategy):
async def get_html(self, url: str) -> str:
return await safe_scrape_html(url)
return self.raw_html or await safe_scrape_html(url)
def clean_scraper(self, scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> tuple[Recipe, ScrapedExtras]:
def try_get_default(
@ -227,9 +236,75 @@ class RecipeScraperPackage(ABCScraperStrategy):
return self.clean_scraper(scraped_data, self.url)
class RecipeScraperOpenAI(RecipeScraperPackage):
"""
A wrapper around the `RecipeScraperPackage` class that uses OpenAI to extract the recipe from the URL,
rather than trying to scrape it directly.
"""
def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
# find the open graph image tag
og_image = soup.find("meta", property="og:image")
if og_image and og_image.get("content"):
return og_image["content"]
# find the largest image on the page
largest_img = None
max_size = 0
for img in soup.find_all("img"):
width = img.get("width", 0)
height = img.get("height", 0)
if not width or not height:
continue
size = int(width) * int(height)
if size > max_size:
max_size = size
largest_img = img
if largest_img:
return largest_img.get("src")
return None
def format_html_to_text(self, html: str) -> str:
soup = bs4.BeautifulSoup(html, "lxml")
text = soup.get_text(separator="\n", strip=True)
if not text:
raise Exception("No text found in HTML")
image = self.find_image(soup)
components = [f"Convert this content to JSON: {text}"]
if image:
components.append(f"Recipe Image: {image}")
return "\n".join(components)
async def get_html(self, url: str) -> str:
settings = get_app_settings()
if not settings.OPENAI_ENABLED:
return ""
html = self.raw_html or await safe_scrape_html(url)
text = self.format_html_to_text(html)
try:
service = OpenAIService()
prompt = service.get_prompt("recipes.scrape-recipe")
response_json = await service.get_response(prompt, text, force_json_response=True)
return (
"<!DOCTYPE html><html><head>"
f'<script type="application/ld+json">{response_json}</script>'
"</head><body></body></html>"
)
except Exception:
self.logger.exception(f"OpenAI was unable to extract a recipe from {url}")
return ""
class RecipeScraperOpenGraph(ABCScraperStrategy):
async def get_html(self, url: str) -> str:
return await safe_scrape_html(url)
return self.raw_html or await safe_scrape_html(url)
def get_recipe_fields(self, html) -> dict | None:
"""

@ -9,6 +9,7 @@ from pydantic import UUID4
from mealie.db.db_setup import session_context
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.openai.recipe_ingredient import OpenAIIngredient, OpenAIIngredients
from mealie.schema.recipe.recipe_ingredient import (
CreateIngredientFood,
CreateIngredientFoodAlias,
@ -24,8 +25,10 @@ from mealie.schema.recipe.recipe_ingredient import (
from mealie.schema.user.user import GroupBase
from mealie.services.openai import OpenAIService
from mealie.services.parser_services import RegisteredParser, get_parser
from mealie.services.parser_services.crfpp.processor import CRFIngredient, convert_list_to_crf_model
from mealie.services.parser_services.openai.parser import OpenAIIngredient, OpenAIIngredients
from mealie.services.parser_services.crfpp.processor import (
CRFIngredient,
convert_list_to_crf_model,
)
from tests.utils.factories import random_int, random_string