@classmethod
def from_parquet(
    cls, parsed_data_path: str, project_dir: Optional[str] = None
) -> "Chunker":
    """Alternate constructor that loads the parsed result from a parquet file.

    The path is validated first (it must exist and its name must end with
    ``parquet``), then read with pandas using the ``pyarrow`` engine. The
    resulting DataFrame and ``project_dir`` are forwarded positionally to
    the class constructor.

    :param parsed_data_path: Path of the parquet file holding the parsed data.
    :param project_dir: Passed through unchanged to the constructor
        (NOTE(review): presumably the output directory; the constructor is
        not visible here, confirm how ``None`` is handled).
    :return: A new instance built from the loaded DataFrame.
    :raises ValueError: If the path does not exist, or does not end with
        ``parquet``.
    """
    path_exists = os.path.exists(parsed_data_path)
    if not path_exists:
        raise ValueError(f"parsed_data_path {parsed_data_path} does not exist.")

    has_parquet_suffix = parsed_data_path.endswith("parquet")
    if not has_parquet_suffix:
        raise ValueError(
            f"parsed_data_path {parsed_data_path} is not a parquet file."
        )

    return cls(pd.read_parquet(parsed_data_path, engine="pyarrow"), project_dir)
def start_chunking(self, yaml_path: str):
    """Run the chunking pipeline described by a YAML configuration file.

    Steps, in order:

    1. Ensure ``self.project_dir`` exists.
    2. Copy the YAML file into it as ``chunk_config.yaml``.
    3. Load the YAML, expand it into module / parameter combinations, and
       hand those to ``run_chunker`` together with ``self.parsed_raw`` and
       ``self.project_dir``.

    :param yaml_path: Path to the chunking YAML configuration file.
    :raises OSError: If the project directory cannot be created or the
        YAML file cannot be copied.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test when several runs share one project directory.
    os.makedirs(self.project_dir, exist_ok=True)

    # Keep a copy of the config next to the results in the project directory.
    config_copy_path = os.path.join(self.project_dir, "chunk_config.yaml")
    try:
        shutil.copy(yaml_path, config_copy_path)
    except shutil.SameFileError:
        # yaml_path already is the project's chunk_config.yaml (e.g. a
        # re-run using the saved config), so there is nothing to copy.
        pass

    # Load the YAML file and expand it into the combinations to run.
    modules = load_yaml(yaml_path)
    input_modules, input_params = get_param_combinations(modules)

    logger.info("Chunking Start...")
    run_chunker(
        modules=input_modules,
        module_params=input_params,
        parsed_result=self.parsed_raw,
        project_dir=self.project_dir,
    )
    logger.info("Chunking Done!")