mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-03-12 04:35:35 -07:00
feat: Open AI Recipe Scraper (#3690)
This commit is contained in:
parent
a49c32e663
commit
4afb767375
frontend
mealie
routes/recipe
schema
services
openai/prompts/recipes
parser_services/openai
scraper
tests/unit_tests
@ -583,6 +583,8 @@
|
||||
"report-deletion-failed": "Report deletion failed",
|
||||
"recipe-debugger": "Recipe Debugger",
|
||||
"recipe-debugger-description": "Grab the URL of the recipe you want to debug and paste it here. The URL will be scraped by the recipe scraper and the results will be displayed. If you don't see any data returned, the site you are trying to scrape is not supported by Mealie or its scraper library.",
|
||||
"use-openai": "Use OpenAI",
|
||||
"recipe-debugger-use-openai-description": "Use OpenAI to parse the results instead of relying on the scraper library. When creating a recipe via URL, this is done automatically if the scraper library fails, but you may test it manually here.",
|
||||
"debug": "Debug",
|
||||
"tree-view": "Tree View",
|
||||
"recipe-yield": "Recipe Yield",
|
||||
|
@ -128,8 +128,8 @@ export class RecipeAPI extends BaseCRUDAPI<CreateRecipe, Recipe, Recipe> {
|
||||
return this.requests.post<UpdateImageResponse>(routes.recipesRecipeSlugImage(slug), { url });
|
||||
}
|
||||
|
||||
async testCreateOneUrl(url: string) {
|
||||
return await this.requests.post<Recipe | null>(routes.recipesTestScrapeUrl, { url });
|
||||
async testCreateOneUrl(url: string, useOpenAI = false) {
|
||||
return await this.requests.post<Recipe | null>(routes.recipesTestScrapeUrl, { url, useOpenAI });
|
||||
}
|
||||
|
||||
async createOneByUrl(url: string, includeTags: boolean) {
|
||||
|
@ -18,7 +18,11 @@
|
||||
:rules="[validators.url]"
|
||||
:hint="$t('new-recipe.url-form-hint')"
|
||||
persistent-hint
|
||||
></v-text-field>
|
||||
/>
|
||||
</v-card-text>
|
||||
<v-card-text v-if="appInfo && appInfo.enableOpenai">
|
||||
{{ $t('recipe.recipe-debugger-use-openai-description') }}
|
||||
<v-checkbox v-model="useOpenAI" :label="$t('recipe.use-openai')"></v-checkbox>
|
||||
</v-card-text>
|
||||
<v-card-actions class="justify-center">
|
||||
<div style="width: 250px">
|
||||
@ -51,7 +55,7 @@
|
||||
|
||||
<script lang="ts">
|
||||
import { defineComponent, reactive, toRefs, ref, useRouter, computed, useRoute } from "@nuxtjs/composition-api";
|
||||
import { useUserApi } from "~/composables/api";
|
||||
import { useAppInfo, useUserApi } from "~/composables/api";
|
||||
import { validators } from "~/composables/use-validators";
|
||||
import { Recipe } from "~/lib/api/types/recipe";
|
||||
|
||||
@ -60,11 +64,13 @@ export default defineComponent({
|
||||
const state = reactive({
|
||||
error: false,
|
||||
loading: false,
|
||||
useOpenAI: false,
|
||||
});
|
||||
|
||||
const api = useUserApi();
|
||||
const route = useRoute();
|
||||
const router = useRouter();
|
||||
const appInfo = useAppInfo();
|
||||
|
||||
const recipeUrl = computed({
|
||||
set(recipe_import_url: string | null) {
|
||||
@ -89,13 +95,14 @@ export default defineComponent({
|
||||
|
||||
state.loading = true;
|
||||
|
||||
const { data } = await api.recipes.testCreateOneUrl(url);
|
||||
const { data } = await api.recipes.testCreateOneUrl(url, state.useOpenAI);
|
||||
|
||||
state.loading = false;
|
||||
debugData.value = data;
|
||||
}
|
||||
|
||||
return {
|
||||
appInfo,
|
||||
recipeUrl,
|
||||
debugTreeView,
|
||||
debugUrl,
|
||||
|
@ -5,7 +5,17 @@ from zipfile import ZipFile
|
||||
|
||||
import orjson
|
||||
import sqlalchemy
|
||||
from fastapi import BackgroundTasks, Depends, File, Form, HTTPException, Path, Query, Request, status
|
||||
from fastapi import (
|
||||
BackgroundTasks,
|
||||
Depends,
|
||||
File,
|
||||
Form,
|
||||
HTTPException,
|
||||
Path,
|
||||
Query,
|
||||
Request,
|
||||
status,
|
||||
)
|
||||
from fastapi.datastructures import UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import UUID4, BaseModel, Field
|
||||
@ -14,7 +24,11 @@ from starlette.background import BackgroundTask
|
||||
from starlette.responses import FileResponse
|
||||
|
||||
from mealie.core import exceptions
|
||||
from mealie.core.dependencies import get_temporary_path, get_temporary_zip_path, validate_recipe_token
|
||||
from mealie.core.dependencies import (
|
||||
get_temporary_path,
|
||||
get_temporary_zip_path,
|
||||
validate_recipe_token,
|
||||
)
|
||||
from mealie.core.security import create_recipe_slug_token
|
||||
from mealie.db.models.group.cookbook import CookBook
|
||||
from mealie.pkgs import cache
|
||||
@ -26,10 +40,19 @@ from mealie.routes._base.routers import MealieCrudRoute, UserAPIRouter
|
||||
from mealie.schema.cookbook.cookbook import ReadCookBook
|
||||
from mealie.schema.make_dependable import make_dependable
|
||||
from mealie.schema.recipe import Recipe, RecipeImageTypes, ScrapeRecipe
|
||||
from mealie.schema.recipe.recipe import CreateRecipe, CreateRecipeByUrlBulk, RecipeLastMade, RecipeSummary
|
||||
from mealie.schema.recipe.recipe import (
|
||||
CreateRecipe,
|
||||
CreateRecipeByUrlBulk,
|
||||
RecipeLastMade,
|
||||
RecipeSummary,
|
||||
)
|
||||
from mealie.schema.recipe.recipe_asset import RecipeAsset
|
||||
from mealie.schema.recipe.recipe_scraper import ScrapeRecipeTest
|
||||
from mealie.schema.recipe.request_helpers import RecipeDuplicate, RecipeZipTokenResponse, UpdateImageResponse
|
||||
from mealie.schema.recipe.request_helpers import (
|
||||
RecipeDuplicate,
|
||||
RecipeZipTokenResponse,
|
||||
UpdateImageResponse,
|
||||
)
|
||||
from mealie.schema.response import PaginationBase, PaginationQuery
|
||||
from mealie.schema.response.pagination import RecipeSearchQuery
|
||||
from mealie.schema.response.responses import ErrorResponse
|
||||
@ -40,13 +63,21 @@ from mealie.services.event_bus_service.event_types import (
|
||||
EventRecipeData,
|
||||
EventTypes,
|
||||
)
|
||||
from mealie.services.recipe.recipe_data_service import InvalidDomainError, NotAnImageError, RecipeDataService
|
||||
from mealie.services.recipe.recipe_data_service import (
|
||||
InvalidDomainError,
|
||||
NotAnImageError,
|
||||
RecipeDataService,
|
||||
)
|
||||
from mealie.services.recipe.recipe_service import RecipeService
|
||||
from mealie.services.recipe.template_service import TemplateService
|
||||
from mealie.services.scraper.recipe_bulk_scraper import RecipeBulkScraperService
|
||||
from mealie.services.scraper.scraped_extras import ScraperContext
|
||||
from mealie.services.scraper.scraper import create_from_url
|
||||
from mealie.services.scraper.scraper_strategies import ForceTimeoutException, RecipeScraperPackage
|
||||
from mealie.services.scraper.scraper_strategies import (
|
||||
ForceTimeoutException,
|
||||
RecipeScraperOpenAI,
|
||||
RecipeScraperPackage,
|
||||
)
|
||||
|
||||
|
||||
class JSONBytes(JSONResponse):
|
||||
@ -210,10 +241,11 @@ class RecipeController(BaseRecipeController):
|
||||
return {"reportId": report_id}
|
||||
|
||||
@router.post("/test-scrape-url")
|
||||
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
|
||||
async def test_parse_recipe_url(self, data: ScrapeRecipeTest):
|
||||
# Debugger should produce the same result as the scraper sees before cleaning
|
||||
ScraperClass = RecipeScraperOpenAI if data.use_openai else RecipeScraperPackage
|
||||
try:
|
||||
if scraped_data := await RecipeScraperPackage(url.url, self.translator).scrape_url():
|
||||
if scraped_data := await ScraperClass(data.url, self.translator).scrape_url():
|
||||
return scraped_data.schema.data
|
||||
except ForceTimeoutException as e:
|
||||
raise HTTPException(
|
||||
|
0
mealie/schema/openai/__init__.py
Normal file
0
mealie/schema/openai/__init__.py
Normal file
10
mealie/schema/openai/_base.py
Normal file
10
mealie/schema/openai/_base.py
Normal file
@ -0,0 +1,10 @@
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class OpenAIBase(BaseModel):
    """
    Base class for models whose JSON schema is sent to OpenAI. The schema of
    each subclass is injected directly into the OpenAI prompt.
    """

    # Blank out the docstring at runtime so that it is not included in the
    # JSON schema that ends up in the prompt.
    __doc__ = ""
|
92
mealie/schema/openai/recipe_ingredient.py
Normal file
92
mealie/schema/openai/recipe_ingredient.py
Normal file
@ -0,0 +1,92 @@
|
||||
from textwrap import dedent
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
|
||||
from ._base import OpenAIBase
|
||||
|
||||
|
||||
class OpenAIIngredient(OpenAIBase):
    # One parsed ingredient as returned by OpenAI. The field descriptions below are
    # part of the JSON schema that is injected into the prompt, so they are written
    # as instructions to the model. This class is documented with comments rather
    # than a docstring on purpose: a docstring would be emitted as the schema's
    # "description" and therefore end up in the prompt as well.

    input: str = Field(
        ...,
        description=dedent(
            """
            The input is simply the ingredient string you are processing as-is. It is forbidden to
            modify this at all, you must provide the input exactly as you received it.
            """
        ),
    )
    confidence: float | None = Field(
        None,
        description=dedent(
            """
            This value is a float between 0 - 100, where 100 is full confidence that the result is correct,
            and 0 is no confidence that the result is correct. If you're unable to parse anything,
            and you put the entire string in the notes, you should return 0 confidence. If you can easily
            parse the string into each component, then you should return a confidence of 100. If you have to
            guess which part is the unit and which part is the food, your confidence should be lower, such as 60.
            Even if there is no unit or note, if you're able to determine the food, you may use a higher confidence.
            If the entire ingredient consists of only a food, you can use a confidence of 100.
            """
        ),
    )

    quantity: float | None = Field(
        0,
        description=dedent(
            """
            The numerical representation of how much of this ingredient. For instance, if you receive
            "3 1/2 grams of minced garlic", the quantity is "3 1/2". Quantity may be represented as a whole number
            (integer), a float or decimal, or a fraction. You should output quantity in only whole numbers or
            floats, converting fractions into floats. Floats longer than 10 decimal places should be
            rounded to 10 decimal places.
            """
        ),
    )
    unit: str | None = Field(
        None,
        description=dedent(
            """
            The unit of measurement for this ingredient. For instance, if you receive
            "2 lbs chicken breast", the unit is "lbs" (short for "pounds").
            """
        ),
    )
    food: str | None = Field(
        None,
        description=dedent(
            """
            The actual physical ingredient used in the recipe. For instance, if you receive
            "3 cups of onions, chopped", the food is "onions".
            """
        ),
    )
    note: str | None = Field(
        None,
        description=dedent(
            """
            The rest of the text that represents more detail on how to prepare the ingredient.
            Anything that is not one of the above should be the note. For instance, if you receive
            "one can of butter beans, drained" the note would be "drained". If you receive
            "3 cloves of garlic peeled and finely chopped", the note would be "peeled and finely chopped".
            """
        ),
    )

    @field_validator("quantity")
    @classmethod
    def coerce_none_qty(cls, v: float | None) -> float:
        # A missing (null) quantity is treated as 0.
        return v or 0

    @field_validator("confidence")
    @classmethod
    def validate_confidence(cls, v: float | None) -> float:
        # The model is asked for a 0 - 100 value; clamp it into that range and
        # rescale to 0 - 1. A missing (null) confidence counts as 0.
        v = v or 0

        if v < 0:
            v = 0
        elif v > 100:
            v = 100

        return v / 100
|
||||
|
||||
|
||||
class OpenAIIngredients(OpenAIBase):
    # Wrapper holding the list of parsed ingredients. Documented with a comment
    # rather than a docstring so nothing extra is added to the JSON schema.
    ingredients: list[OpenAIIngredient] = []
|
@ -1,10 +1,11 @@
|
||||
from pydantic import ConfigDict
|
||||
from pydantic import ConfigDict, Field
|
||||
|
||||
from mealie.schema._mealie.mealie_model import MealieModel
|
||||
|
||||
|
||||
class ScrapeRecipeTest(MealieModel):
    # Request body of the scraper debugger endpoint ("/test-scrape-url").

    # URL of the recipe page to scrape.
    url: str

    # When true the debugger uses the OpenAI scraper instead of the scraper
    # library. The frontend posts this flag as "useOpenAI", hence the explicit alias.
    use_openai: bool = Field(default=False, alias="useOpenAI")
|
||||
|
||||
|
||||
class ScrapeRecipe(MealieModel):
|
||||
|
@ -1,11 +1,4 @@
|
||||
You are a bot that parses user input into recipe ingredients. You will receive a list of one or more ingredients, each containing one or more of the following components:
|
||||
- Food: the actual physical ingredient used in the recipe. For instance, if you receive "3 cups of onions, chopped", the food is "onions"
|
||||
- Unit: the unit of measurement for this ingredient. For instance, if you receive "2 lbs chicken breast", the unit is "lbs" (short for "pounds")
|
||||
- Quantity: the numerical representation of how much of this ingredient. For instance, if you receive "3 1/2 grams of minced garlic", the quantity is "3 1/2". Quantity may be represented as a whole number (integer), a float or decimal, or a fraction. You should output quantity in only whole numbers or floats, converting fractions into floats. Floats longer than 10 decimal places should be rounded to 10 decimal places.
|
||||
- Note: the rest of the text that represents more detail on how to prepare the ingredient. Anything that is not one of the above should be the note. For instance, if you receive "one can of butter beans, drained" the note would be "drained". If you receive "3 cloves of garlic peeled and finely chopped", the note would be "peeled and finely chopped"
|
||||
- Input: The input is simply the ingredient string you are processing as-is. It is forbidden to modify this at all, you must provide the input exactly as you received it
|
||||
|
||||
While parsing the ingredients, there are some things to keep in mind:
|
||||
You are a bot that parses user input into recipe ingredients. You will receive a list of one or more ingredients, each containing one or more of the following components: quantity, unit, food, and note. Their definitions are stated in the JSON schema below. While parsing the ingredients, there are some things to keep in mind:
|
||||
- If you cannot accurately determine the quantity, unit, food, or note, you should place everything into the note field and leave everything else empty. It's better to err on the side of putting everything in the note field than being wrong
|
||||
- You may receive recipe ingredients from multiple different languages. You should adhere to the grammar rules of the input language when trying to parse the ingredient string
|
||||
- Sometimes foods or units will be in their singular, plural, or other grammatical forms. You must interpret all of them appropriately
|
||||
@ -17,8 +10,6 @@ While parsing the ingredients, there are some things to keep in mind:
|
||||
|
||||
It is imperative that you do not create any data or otherwise make up any information. Failure to adhere to this rule is illegal and will result in harsh punishment. If you are unsure, place the entire string into the note section of the response. Do not make things up.
|
||||
|
||||
In addition to calculating the recipe ingredient fields, you are also responsible for including a confidence value. This value is a float between 0 - 1, where 1 is full confidence that the result is correct, and 0 is no confidence that the result is correct. If you're unable to parse anything, and you put the entire string in the notes, you should return 0 confidence. If you can easily parse the string into each component, then you should return a confidence of 1. If you have to guess which part is the unit and which part is the food, your confidence should be lower, such as 0.6. Even if there is no unit or note, if you're able to determine the food, you may use a higher confidence. If the entire ingredient consists of only a food, you can use a confidence of 1.
|
||||
|
||||
Below you will receive the JSON schema for your response. Your response must be in valid JSON in the below schema as provided. You must respond in this JSON schema; failure to do so is illegal. It is imperative that you follow the schema precisely to avoid punishment. You must follow the JSON schema.
|
||||
|
||||
The user message that you receive will be the list of one or more recipe ingredients for you to parse. Your response should have exactly one item for each item provided. For instance, if you receive 12 items to parse, then your response should be an array of 12 parsed items.
|
||||
|
7
mealie/services/openai/prompts/recipes/scrape-recipe.txt
Normal file
7
mealie/services/openai/prompts/recipes/scrape-recipe.txt
Normal file
@ -0,0 +1,7 @@
|
||||
You are a bot that reads website data and parses it into recipe JSON. You will receive the contents of a webpage (such as its HTML) and you need to extract the recipe data and return it as valid JSON that follows the recipe schema. The recipe schema is the standard schema.org schema, which is defined at "https://schema.org/Recipe".
|
||||
|
||||
It is imperative that you do not create any data or otherwise make up any information. Failure to adhere to this rule is illegal and will result in harsh punishment. If you are unable to extract data due to insufficient input, you may reply with a completely empty JSON object (represented by two curly braces: {}).
|
||||
|
||||
Your response must be in valid JSON in the schema.org Recipe definition. You must respond in this JSON schema; failure to do so is illegal. It is imperative that you follow the schema precisely to avoid punishment. You must follow the JSON schema.
|
||||
|
||||
The user message that you receive will be the webpage contents, including (but not necessarily limited to) text extracted from the HTML.
|
@ -2,8 +2,7 @@ import asyncio
|
||||
import json
|
||||
from collections.abc import Awaitable
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from mealie.schema.openai.recipe_ingredient import OpenAIIngredient, OpenAIIngredients
|
||||
from mealie.schema.recipe.recipe_ingredient import (
|
||||
CreateIngredientFood,
|
||||
CreateIngredientUnit,
|
||||
@ -16,27 +15,6 @@ from mealie.services.openai import OpenAIDataInjection, OpenAIService
|
||||
from .._base import ABCIngredientParser
|
||||
|
||||
|
||||
class OpenAIIngredient(BaseModel):
|
||||
"""
|
||||
This class defines the JSON schema sent to OpenAI. Its schema is
|
||||
injected directly into the OpenAI prompt.
|
||||
"""
|
||||
|
||||
__doc__ = "" # we don't want to include the docstring in the JSON schema
|
||||
|
||||
input: str
|
||||
confidence: float | None = None
|
||||
|
||||
quantity: float | None = 0
|
||||
unit: str | None = None
|
||||
food: str | None = None
|
||||
note: str | None = None
|
||||
|
||||
|
||||
class OpenAIIngredients(BaseModel):
|
||||
ingredients: list[OpenAIIngredient] = []
|
||||
|
||||
|
||||
class OpenAIParser(ABCIngredientParser):
|
||||
def _convert_ingredient(self, openai_ing: OpenAIIngredient) -> ParsedIngredient:
|
||||
ingredient = RecipeIngredient(
|
||||
|
@ -2,9 +2,19 @@ from mealie.lang.providers import Translator
|
||||
from mealie.schema.recipe.recipe import Recipe
|
||||
from mealie.services.scraper.scraped_extras import ScrapedExtras
|
||||
|
||||
from .scraper_strategies import ABCScraperStrategy, RecipeScraperOpenGraph, RecipeScraperPackage
|
||||
from .scraper_strategies import (
|
||||
ABCScraperStrategy,
|
||||
RecipeScraperOpenAI,
|
||||
RecipeScraperOpenGraph,
|
||||
RecipeScraperPackage,
|
||||
safe_scrape_html,
|
||||
)
|
||||
|
||||
DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [RecipeScraperPackage, RecipeScraperOpenGraph]
|
||||
DEFAULT_SCRAPER_STRATEGIES: list[type[ABCScraperStrategy]] = [
|
||||
RecipeScraperPackage,
|
||||
RecipeScraperOpenAI,
|
||||
RecipeScraperOpenGraph,
|
||||
]
|
||||
|
||||
|
||||
class RecipeScraper:
|
||||
@ -27,8 +37,9 @@ class RecipeScraper:
|
||||
Scrapes a recipe from the web.
|
||||
"""
|
||||
|
||||
raw_html = await safe_scrape_html(url)
|
||||
for scraper_type in self.scrapers:
|
||||
scraper = scraper_type(url, self.translator)
|
||||
scraper = scraper_type(url, self.translator, raw_html=raw_html)
|
||||
result = await scraper.parse()
|
||||
|
||||
if result is not None:
|
||||
|
@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import bs4
|
||||
import extruct
|
||||
from fastapi import HTTPException, status
|
||||
from httpx import AsyncClient
|
||||
@ -10,10 +11,12 @@ from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrap
|
||||
from slugify import slugify
|
||||
from w3lib.html import get_base_url
|
||||
|
||||
from mealie.core.config import get_app_settings
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.pkgs import safehttp
|
||||
from mealie.schema.recipe.recipe import Recipe, RecipeStep
|
||||
from mealie.services.openai import OpenAIService
|
||||
from mealie.services.scraper.scraped_extras import ScrapedExtras
|
||||
|
||||
from . import cleaner
|
||||
@ -86,9 +89,15 @@ class ABCScraperStrategy(ABC):
|
||||
|
||||
url: str
|
||||
|
||||
def __init__(self, url: str, translator: Translator) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
translator: Translator,
|
||||
raw_html: str | None = None,
|
||||
) -> None:
|
||||
self.logger = get_logger()
|
||||
self.url = url
|
||||
self.raw_html = raw_html
|
||||
self.translator = translator
|
||||
|
||||
@abstractmethod
|
||||
@ -109,7 +118,7 @@ class ABCScraperStrategy(ABC):
|
||||
|
||||
class RecipeScraperPackage(ABCScraperStrategy):
|
||||
async def get_html(self, url: str) -> str:
|
||||
return await safe_scrape_html(url)
|
||||
return self.raw_html or await safe_scrape_html(url)
|
||||
|
||||
def clean_scraper(self, scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> tuple[Recipe, ScrapedExtras]:
|
||||
def try_get_default(
|
||||
@ -227,9 +236,75 @@ class RecipeScraperPackage(ABCScraperStrategy):
|
||||
return self.clean_scraper(scraped_data, self.url)
|
||||
|
||||
|
||||
class RecipeScraperOpenAI(RecipeScraperPackage):
    """
    A wrapper around the `RecipeScraperPackage` class that uses OpenAI to extract the recipe from the URL,
    rather than trying to scrape it directly.

    `get_html` is overridden so that, instead of the page's real HTML, it returns a minimal HTML
    document whose only content is the OpenAI response embedded as a JSON-LD script tag.
    """

    @staticmethod
    def _parse_dimension(value: Any) -> int:
        """Return an `<img>` width/height attribute as a positive int, or 0 if it is unusable."""

        # Attributes come straight from arbitrary web pages and are frequently
        # not plain integers (e.g. "100%", "auto", "12.5").
        try:
            parsed = int(str(value).strip())
        except (TypeError, ValueError):
            return 0

        return parsed if parsed > 0 else 0

    def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
        """
        Pick an image URL for the recipe from the parsed page.

        Prefers the Open Graph image; otherwise falls back to the `<img>` tag with the largest
        declared `width * height`. Images without usable integer dimensions are ignored.
        Returns None when no candidate is found.
        """

        # find the open graph image tag
        og_image = soup.find("meta", property="og:image")
        if og_image is not None and og_image.get("content"):
            return og_image["content"]

        # find the largest image on the page
        largest_img = None
        max_size = 0
        for img in soup.find_all("img"):
            width = self._parse_dimension(img.get("width"))
            height = self._parse_dimension(img.get("height"))
            if not width or not height:
                continue

            size = width * height
            if size > max_size:
                max_size = size
                largest_img = img

        if largest_img is not None:
            return largest_img.get("src")

        return None

    def format_html_to_text(self, html: str) -> str:
        """
        Build the user message sent to OpenAI from the page HTML.

        The message contains the page's visible text and, when one can be found,
        a trailing "Recipe Image" line.

        Raises:
            Exception: if the HTML contains no text at all.
        """

        soup = bs4.BeautifulSoup(html, "lxml")

        text = soup.get_text(separator="\n", strip=True)
        if not text:
            raise Exception("No text found in HTML")

        components = [f"Convert this content to JSON: {text}"]

        image = self.find_image(soup)
        if image:
            components.append(f"Recipe Image: {image}")

        return "\n".join(components)

    async def get_html(self, url: str) -> str:
        """
        Return a synthetic HTML document carrying the OpenAI-extracted recipe as JSON-LD.

        Returns an empty string when OpenAI is disabled in the app settings or when the
        extraction fails for any reason (the failure is logged). Errors raised while
        fetching the page itself are not caught here.
        """

        settings = get_app_settings()
        if not settings.OPENAI_ENABLED:
            return ""

        # Fetching stays outside the try block so fetch errors still reach the caller.
        html = self.raw_html or await safe_scrape_html(url)

        try:
            text = self.format_html_to_text(html)

            service = OpenAIService()
            prompt = service.get_prompt("recipes.scrape-recipe")
            response_json = await service.get_response(prompt, text, force_json_response=True)

            # "</" inside a JSON string (e.g. "</script>") would end the script element early
            # and truncate the payload; "<\/" is an equivalent, valid JSON escape.
            embedded_json = response_json.replace("</", "<\\/")

            return (
                "<!DOCTYPE html><html><head>"
                f'<script type="application/ld+json">{embedded_json}</script>'
                "</head><body></body></html>"
            )
        except Exception:
            self.logger.exception(f"OpenAI was unable to extract a recipe from {url}")
            return ""
|
||||
|
||||
|
||||
class RecipeScraperOpenGraph(ABCScraperStrategy):
|
||||
async def get_html(self, url: str) -> str:
|
||||
return await safe_scrape_html(url)
|
||||
return self.raw_html or await safe_scrape_html(url)
|
||||
|
||||
def get_recipe_fields(self, html) -> dict | None:
|
||||
"""
|
||||
|
@ -9,6 +9,7 @@ from pydantic import UUID4
|
||||
|
||||
from mealie.db.db_setup import session_context
|
||||
from mealie.repos.repository_factory import AllRepositories
|
||||
from mealie.schema.openai.recipe_ingredient import OpenAIIngredient, OpenAIIngredients
|
||||
from mealie.schema.recipe.recipe_ingredient import (
|
||||
CreateIngredientFood,
|
||||
CreateIngredientFoodAlias,
|
||||
@ -24,8 +25,10 @@ from mealie.schema.recipe.recipe_ingredient import (
|
||||
from mealie.schema.user.user import GroupBase
|
||||
from mealie.services.openai import OpenAIService
|
||||
from mealie.services.parser_services import RegisteredParser, get_parser
|
||||
from mealie.services.parser_services.crfpp.processor import CRFIngredient, convert_list_to_crf_model
|
||||
from mealie.services.parser_services.openai.parser import OpenAIIngredient, OpenAIIngredients
|
||||
from mealie.services.parser_services.crfpp.processor import (
|
||||
CRFIngredient,
|
||||
convert_list_to_crf_model,
|
||||
)
|
||||
from tests.utils.factories import random_int, random_string
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user