@parser_node
def langchain_parse(
    data_path_list: List[str], parse_method: str, **kwargs
) -> Tuple[List[str], List[str], List[int]]:
    """
    Parse documents using a langchain document_loaders (parse) method.

    For the "directory" and "unstructured" methods all paths are handed to
    ``parse_all_files`` in a single call and every page number is reported
    as -1. For any other method each path is parsed independently by
    ``langchain_parse_pure`` inside a process pool (``mp.Pool``) and the
    per-file results are concatenated in input order.

    :param data_path_list: The list of data paths to parse.
    :param parse_method: A langchain document_loaders(parse) method to use.
    :param kwargs: The extra parameters for creating the langchain
        document_loaders(parse) instance.
    :return: tuple of lists containing the parsed texts, path and pages.
        Three empty lists are returned when there is nothing to parse in
        the per-file (parallel) branch.
    """
    if parse_method in ["directory", "unstructured"]:
        results = parse_all_files(data_path_list, parse_method, **kwargs)
        texts, path = results[0], results[1]
        # These loaders do not report page numbers; -1 marks "unknown".
        pages = [-1] * len(texts)
        return texts, path, pages

    # Materialise the jobs first so emptiness and size can be checked even
    # when the caller passes a non-list iterable.
    jobs = [(data_path, parse_method, kwargs) for data_path in data_path_list]
    if not jobs:
        # Without this guard zip(*results) yields nothing and the three-way
        # unpacking below raises ValueError.
        return [], [], []

    # Never start more worker processes than there are files to parse.
    num_workers = min(mp.cpu_count(), len(jobs))

    # Execute parallel processing
    with mp.Pool(num_workers) as pool:
        results = pool.starmap(langchain_parse_pure, jobs)

    # results is a list of (texts, path, pages) triples, one per file;
    # transpose it and flatten each column into a single list.
    texts, path, pages = (
        list(chain.from_iterable(item)) for item in zip(*results)
    )
    return texts, path, pages
def langchain_parse_pure(
    data_path: str, parse_method: str, kwargs
) -> Tuple[List[str], List[str], List[int]]:
    """
    Parse a single file with the loader registered for ``parse_method``.

    Args:
        data_path (str): The file path to parse.
        parse_method (str): Key into ``parse_modules`` selecting the loader.
        kwargs (Dict): Keyword arguments forwarded to the loader constructor.

    Returns:
        Tuple[List[str], List[str], List[int]]: Three lists of equal length:
        the ``page_content`` of every loaded document, ``data_path``
        repeated once per document, and the page numbers. Page numbers
        count up from 1 for the "pymupdf", "pdfplumber", "pypdf" and
        "pypdfium2" methods and are -1 for every other method.
    """
    loader = parse_modules[parse_method](data_path, **kwargs)

    # Pull the raw text out of every document the loader produced.
    documents = loader.load()
    texts = [document.page_content for document in documents]
    doc_count = len(texts)
    path = [data_path] * doc_count

    page_aware_methods = ("pymupdf", "pdfplumber", "pypdf", "pypdfium2")
    if parse_method not in page_aware_methods:
        # No page information for this method; -1 marks "unknown".
        pages = [-1] * doc_count
    else:
        # NOTE(review): assumes these loaders return one document per page,
        # in page order - confirm against the loader implementations.
        pages = list(range(1, len(documents) + 1))

    # Drop the loader reference before returning.
    del loader
    return texts, path, pages