Source code for autorag.data.utils.util

import mimetypes
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Callable

import pandas as pd
import yaml
from langchain_core.documents import Document
from llama_index.core.schema import NodeRelationship

from autorag.schema import Module
from autorag.utils.util import make_combinations, explode


def get_file_metadata(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    Args:
        file_path: Path of the file to inspect, as a string.

    Returns:
        A dict with the keys ``file_path``, ``file_name``, ``file_type``
        (MIME type guessed from the file name, or ``None`` if unknown),
        ``file_size`` (bytes), and ``creation_datetime``,
        ``last_modified_datetime`` and ``last_accessed_datetime`` formatted
        as ``YYYY-MM-DD`` strings in local time.

    Raises:
        OSError: If the file cannot be stat-ed (e.g. it does not exist).
    """
    # Stat the file once so the size and all three timestamps come from the
    # same snapshot instead of four separate filesystem calls.
    stat_result = os.stat(file_path)

    def _as_date(timestamp: float) -> str:
        # Dates are rendered day-granular, in the local timezone.
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.st_size,
        # NOTE: st_ctime is platform dependent (creation time on Windows,
        # last metadata change on most Unix systems).
        "creation_datetime": _as_date(stat_result.st_ctime),
        "last_modified_datetime": _as_date(stat_result.st_mtime),
        "last_accessed_datetime": _as_date(stat_result.st_atime),
    }
def add_essential_metadata(metadata: Dict) -> Dict:
    """Ensure ``metadata`` carries a ``last_modified_datetime`` entry.

    The dict is updated in place: when the key is absent it is set to the
    current local time (a ``datetime`` object). An existing value is kept.

    Args:
        metadata: Metadata dict to complete.

    Returns:
        The same ``metadata`` object that was passed in.
    """
    metadata.setdefault("last_modified_datetime", datetime.now())
    return metadata
def corpus_df_to_langchain_documents(corpus_df: pd.DataFrame) -> List[Document]:
    """Turn each row of a corpus DataFrame into a langchain ``Document``.

    Args:
        corpus_df: DataFrame providing the columns ``contents``, ``doc_id``
            and ``metadata`` (each metadata cell is unpacked as a mapping).

    Returns:
        One ``Document`` per row, in row order. ``page_content`` is the row's
        ``contents``; the document metadata is the row's ``metadata`` plus a
        ``filename`` entry holding the row's ``doc_id``.
    """
    contents = corpus_df["contents"].tolist()
    doc_ids = corpus_df["doc_id"].tolist()
    row_metadatas = corpus_df["metadata"].tolist()

    # "filename" is placed first, so a "filename" key already present in the
    # row metadata takes precedence over the doc_id.
    return [
        Document(page_content=text, metadata={"filename": doc_id, **row_metadata})
        for text, doc_id, row_metadata in zip(contents, doc_ids, row_metadatas)
    ]
def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict:
    """Complete node metadata with a timestamp and neighbour ids.

    The dict is updated in place. Keys that already exist are never
    overwritten:

    * ``last_modified_datetime`` defaults to the current local time.
    * ``prev_id`` / ``next_id`` are copied from the ``node_id`` of the
      ``NodeRelationship.PREVIOUS`` / ``NodeRelationship.NEXT`` entries of
      ``relationships``, when such an entry exists and is truthy.

    Args:
        metadata: Metadata dict to complete.
        relationships: Mapping from ``NodeRelationship`` to related-node
            objects exposing a ``node_id`` attribute.

    Returns:
        The same ``metadata`` object that was passed in.
    """
    metadata.setdefault("last_modified_datetime", datetime.now())

    if "prev_id" not in metadata:
        # .get() yields None for a missing relationship, which is skipped
        # exactly like a present-but-falsy one.
        previous = relationships.get(NodeRelationship.PREVIOUS)
        if previous:
            metadata["prev_id"] = previous.node_id

    if "next_id" not in metadata:
        following = relationships.get(NodeRelationship.NEXT)
        if following:
            metadata["next_id"] = following.node_id

    return metadata
def load_yaml(yaml_path: str):
    """Read a YAML file and return the value stored under its ``modules`` key.

    Args:
        yaml_path: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        Whatever the parsed document holds under the top-level ``modules`` key.

    Raises:
        ValueError: If the file does not exist or is not valid YAML.

    Note:
        The ``modules`` lookup is not guarded: a document that parses to a
        mapping without that key raises ``KeyError``, and an empty document
        (parsed as ``None``) raises ``TypeError``.
    """
    if not os.path.exists(yaml_path):
        raise ValueError(f"YAML file {yaml_path} does not exist.")

    with open(yaml_path, "r", encoding="utf-8") as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Surface parser failures with the same exception type as a
            # missing file, keeping the original error as the cause.
            raise ValueError(f"YAML file {yaml_path} could not be loaded.") from exc

    return parsed["modules"]
def get_param_combinations(modules: List[Dict]) -> Tuple[List[Callable], List[Dict]]:
    """Build the module callables and their parameter combinations.

    Each entry of ``modules`` is turned into a ``Module`` via
    ``Module.from_dict``. The parameters of every module are passed through
    ``make_combinations``, and ``explode`` then combines the callables with
    those per-module results.

    Args:
        modules: Module configuration dicts accepted by ``Module.from_dict``.

    Returns:
        The two values produced by ``explode``: a list of module callables
        and a list of parameter dicts.
    """
    instances = [Module.from_dict(module_dict) for module_dict in modules]
    callables = [instance.module for instance in instances]
    per_module_combinations = [
        make_combinations(instance.module_param) for instance in instances
    ]

    module_list, combination_list = explode(callables, per_module_combinations)
    return module_list, combination_list
def get_start_end_idx(original_text: str, search_str: str) -> Tuple[int, int]:
    """Locate the first occurrence of ``search_str`` inside ``original_text``.

    Args:
        original_text: Text to search in.
        search_str: Substring to look for.

    Returns:
        ``(start, end)`` where ``end`` is inclusive, i.e. the index of the
        last matched character. ``(0, 0)`` is returned when the substring is
        not found, which is indistinguishable from a one-character match at
        index 0. An empty ``search_str`` matches at index 0 and yields
        ``(0, -1)``.
    """
    position = original_text.find(search_str)
    if position < 0:
        return 0, 0
    # The end index is inclusive, hence the -1.
    return position, position + len(search_str) - 1