class Validator:
    """Run a small trial on a sample of the QA and corpus datasets."""

    def __init__(self, qa_data_path: str, corpus_data_path: str):
        """
        Initialize a Validator object.

        :param qa_data_path: The path to the QA dataset. Must be parquet file.
        :param corpus_data_path: The path to the corpus dataset. Must be parquet file.
        :raises ValueError: If either path does not exist or does not end
            with ``.parquet``.
        """
        labelled_paths = (("QA", qa_data_path), ("Corpus", corpus_data_path))

        # Existence is checked for both paths before the extension checks so
        # the error precedence stays: QA missing, corpus missing, QA not
        # parquet, corpus not parquet.
        for label, path in labelled_paths:
            if not os.path.exists(path):
                raise ValueError(f"{label} data path {path} does not exist.")
        for label, path in labelled_paths:
            if not path.endswith(".parquet"):
                raise ValueError(f"{label} data path {path} is not a parquet file.")

        self.qa_data = cast_qa_dataset(pd.read_parquet(qa_data_path, engine="pyarrow"))
        self.corpus_data = cast_corpus_dataset(
            pd.read_parquet(corpus_data_path, engine="pyarrow")
        )

    def _sample_datasets(self, qa_cnt: int, random_state: int):
        """Return a QA sample and the corpus rows referenced by its ``retrieval_gt``."""
        # Never ask pandas for more rows than exist; warn when the request is capped.
        available_records = len(self.qa_data)
        safe_sample_size = min(qa_cnt, available_records)
        if safe_sample_size < qa_cnt:
            logger.warning(
                f"Minimal Requested sample size ({qa_cnt}) is larger than available records ({available_records}). "
                f"Sampling will be limited to {safe_sample_size} records. "
            )

        sample_qa_df = self.qa_data.sample(
            n=safe_sample_size, random_state=random_state
        ).reset_index(drop=True)

        # Each retrieval_gt cell is flattened one level and the per-row
        # results are then chained into one list of doc ids (duplicates kept).
        target_doc_ids = list(
            itertools.chain.from_iterable(
                itertools.chain.from_iterable(retrieval_gt)
                for retrieval_gt in sample_qa_df["retrieval_gt"]
            )
        )

        # Keep only the corpus rows that the sampled QA rows refer to.
        sample_corpus_df = self.corpus_data.loc[
            self.corpus_data["doc_id"].isin(target_doc_ids)
        ].reset_index(drop=True)
        return sample_qa_df, sample_corpus_df

    def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42) -> None:
        """
        Run a trial on a small sample of the data in a temporary project directory.

        Samples up to ``qa_cnt`` QA rows, selects the corpus rows whose
        ``doc_id`` appears in the sampled ``retrieval_gt`` values, checks the
        pair with ``validate_qa_from_corpus_dataset``, writes both samples to
        temporary parquet files and starts an ``Evaluator`` trial on them with
        ``skip_validation=True``.

        :param yaml_path: The path passed to ``Evaluator.start_trial``.
        :param qa_cnt: The requested number of QA rows to sample. It is capped
            at the number of available QA rows, with a warning.
        :param random_state: The seed passed to ``DataFrame.sample``.
        """
        sample_qa_df, sample_corpus_df = self._sample_datasets(qa_cnt, random_state)
        validate_qa_from_corpus_dataset(sample_qa_df, sample_corpus_df)

        # start Evaluate at temp project directory
        with (
            tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path,
            tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as corpus_path,
            tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir,
        ):
            try:
                sample_qa_df.to_parquet(qa_path.name, index=False)
                sample_corpus_df.to_parquet(corpus_path.name, index=False)

                evaluator = Evaluator(
                    qa_data_path=qa_path.name,
                    corpus_data_path=corpus_path.name,
                    project_dir=temp_project_dir,
                )
                evaluator.start_trial(yaml_path, skip_validation=True)
            finally:
                # The files are created with delete=False, so they must be
                # removed by hand. Doing it in `finally` prevents leaking them
                # when writing the samples or running the trial raises.
                for temp_file in (qa_path, corpus_path):
                    temp_file.close()
                    try:
                        os.unlink(temp_file.name)
                    except FileNotFoundError:
                        # Already gone; there is nothing left to clean up.
                        pass

        logger.info("Validation complete.")