def parser_node(func):
    """Decorate a parser module so it is driven by a glob pattern and a file type.

    The returned wrapper resolves ``data_path_glob`` to concrete paths, keeps
    only the paths whose extension equals ``file_type`` (every path when
    ``file_type`` is ``"all_files"``), calls ``func`` with the surviving paths
    and passes the result through ``_add_last_modified_datetime``.  The wrapper
    is itself decorated with ``result_to_dataframe`` using the column names
    ``texts``, ``path``, ``page`` and ``last_modified_datetime``.

    How ``func`` is called depends on ``func.__name__``:

    * ``langchain_parse`` -- called with ``data_path_list``, the lower-cased
      ``parse_method`` and ``**kwargs``.  When ``parse_method`` is
      ``"directory"``, ``data_path_glob`` is split into a folder and a trailing
      pattern which are added to ``kwargs`` as ``path`` and ``glob``.
    * ``clova_ocr``, ``llama_parse``, ``table_hybrid_parse`` -- called with
      ``data_path_list`` and ``**kwargs`` only.

    The wrapper raises:
        FileNotFoundError: ``data_path_glob`` matches nothing.
        AssertionError: ``file_type`` is not one of the supported types.
        ValueError: ``func.__name__`` is not a supported module, or
            ``parse_method`` is missing for ``langchain_parse``.
    """
    supported_file_types = ("pdf", "csv", "json", "md", "html", "xml", "all_files")
    modules_without_parse_method = ("clova_ocr", "llama_parse", "table_hybrid_parse")

    @functools.wraps(func)
    @result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
    def wrapper(
        data_path_glob: str,
        file_type: str,
        parse_method: Optional[str] = None,
        **kwargs,
    ) -> Tuple[List[str], List[str], List[int], List[datetime]]:
        logger.info("Running parser - %s module...", func.__name__)

        data_path_list = glob(data_path_glob)
        if not data_path_list:
            raise FileNotFoundError(f"data does not exits in {data_path_glob}")

        # Raised explicitly instead of via `assert` so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if file_type not in supported_file_types:
            raise AssertionError(f"search type {file_type} is not supported")

        # extract only files from data_path_list based on the file_type set in the YAML file
        if file_type == "all_files":
            data_paths = data_path_list
        else:
            data_paths = [
                data_path
                for data_path in data_path_list
                if os.path.basename(data_path).split(".")[-1] == file_type
            ]

        if func.__name__ == "langchain_parse":
            if parse_method is None:
                # Previously surfaced as an AttributeError from None.lower().
                raise ValueError(
                    "parse_method must be specified for the langchain_parse module"
                )
            parse_method = parse_method.lower()
            if parse_method == "directory":
                # os.path.split keeps a root folder intact ("/x.pdf" -> "/")
                # and honours the platform's path separators.
                folder_path, glob_path = os.path.split(data_path_glob)
                kwargs.update({"glob": glob_path, "path": folder_path})
            result = func(
                data_path_list=data_paths, parse_method=parse_method, **kwargs
            )
        elif func.__name__ in modules_without_parse_method:
            result = func(data_path_list=data_paths, **kwargs)
        else:
            raise ValueError(f"Unsupported module_type: {func.__name__}")

        result = _add_last_modified_datetime(result)
        return result

    return wrapper