Source code for autorag.data.parse.base

import functools
import logging
from datetime import datetime
from glob import glob
from typing import Tuple, List, Optional
import os

from autorag.utils import result_to_dataframe
from autorag.data.utils.util import get_file_metadata

logger = logging.getLogger("AutoRAG")


def parser_node(func):
    """Wrap a parse module function with the shared parser-node pipeline.

    The returned wrapper:

    1. expands ``data_path_glob`` with :func:`glob.glob`,
    2. checks ``file_type`` against the supported types,
    3. keeps only the paths whose text after the last ``.`` of the base name
       equals ``file_type`` (every path is kept for ``"all_files"``),
    4. calls ``func`` with the selected paths, dispatching on
       ``func.__name__``,
    5. appends the last-modified datetimes via
       ``_add_last_modified_datetime``.

    The wrapper is itself decorated with ``result_to_dataframe`` using the
    columns ``texts``, ``path``, ``page`` and ``last_modified_datetime``;
    presumably callers therefore receive a DataFrame rather than the
    annotated tuple -- confirm against ``autorag.utils``.

    :param func: The parse module function. Its ``__name__`` must be one of
        ``langchain_parse``, ``clova_ocr``, ``llama_parse`` or
        ``table_hybrid_parse``.
    :return: The wrapped function, taking ``data_path_glob``, ``file_type``,
        an optional ``parse_method`` and extra keyword arguments that are
        forwarded to ``func``.
    :raises FileNotFoundError: (from the wrapper) if the glob matches nothing.
    :raises AssertionError: (from the wrapper) if ``file_type`` is not
        supported.
    :raises ValueError: (from the wrapper) if ``func`` is not a supported
        module, or if ``langchain_parse`` is used without ``parse_method``.
    """

    @functools.wraps(func)
    @result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
    def wrapper(
        data_path_glob: str,
        file_type: str,
        parse_method: Optional[str] = None,
        **kwargs,
    ) -> Tuple[List[str], List[str], List[int], List[datetime]]:
        logger.info(f"Running parser - {func.__name__} module...")

        data_path_list = glob(data_path_glob)
        if not data_path_list:
            raise FileNotFoundError(f"data does not exits in {data_path_glob}")

        supported_file_types = (
            "pdf",
            "csv",
            "json",
            "md",
            "html",
            "xml",
            "all_files",
        )
        # Raised explicitly instead of through an `assert` statement so the
        # check is not stripped under `python -O`; AssertionError is kept so
        # existing callers observe the same exception type and message.
        if file_type not in supported_file_types:
            raise AssertionError(f"search type {file_type} is not supported")

        # Extract only files from data_path_list based on the file_type set
        # in the YAML file.
        if file_type == "all_files":
            data_paths = data_path_list
        else:
            data_paths = [
                data_path
                for data_path in data_path_list
                if os.path.basename(data_path).split(".")[-1] == file_type
            ]

        if func.__name__ == "langchain_parse":
            if parse_method is None:
                # Previously this surfaced as an AttributeError on
                # `None.lower()`; fail with an actionable message instead.
                raise ValueError(
                    "parse_method must be specified for langchain_parse"
                )
            parse_method = parse_method.lower()
            if parse_method == "directory":
                # The "directory" method receives the folder and the glob
                # pattern separately: split on the last "/".
                folder_path, _, glob_path = data_path_glob.rpartition("/")
                kwargs.update({"glob": glob_path, "path": folder_path})
            result = func(
                data_path_list=data_paths, parse_method=parse_method, **kwargs
            )
        elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
            # These modules do not take a parse_method argument.
            result = func(data_path_list=data_paths, **kwargs)
        else:
            raise ValueError(f"Unsupported module_type: {func.__name__}")

        return _add_last_modified_datetime(result)

    return wrapper
def _add_last_modified_datetime(result): last_modified_datetime_lst = list( map(lambda x: get_file_metadata(x)["last_modified_datetime"], result[1]) ) result_with_dates = result + (last_modified_datetime_lst,) return result_with_dates