def validate_qa_dataset(df: pd.DataFrame):
    """Assert that a QA dataframe carries the four mandatory columns.

    Args:
        df: candidate QA dataframe.

    Raises:
        AssertionError: if any of qid/query/retrieval_gt/generation_gt is missing.
    """
    columns = ["qid", "query", "retrieval_gt", "generation_gt"]
    assert all(
        col in df.columns for col in columns
    ), f"df must have columns {columns}, but got {df.columns}"
def validate_corpus_dataset(df: pd.DataFrame):
    """Assert that a corpus dataframe carries the three mandatory columns.

    Args:
        df: candidate corpus dataframe.

    Raises:
        AssertionError: if any of doc_id/contents/metadata is missing.
    """
    columns = ["doc_id", "contents", "metadata"]
    present = set(df.columns)
    assert set(columns) <= present, f"df must have columns {columns}, but got {df.columns}"
def cast_qa_dataset(df: pd.DataFrame):
    """Normalize a QA dataframe and return it.

    - ``retrieval_gt`` is coerced to list[list] form: ``str`` -> ``[[str]]``,
      ``list[str]`` -> ``[list]``, ``list[list]`` kept as-is, ``np.ndarray``
      converted via ``tolist()`` and re-cast.
    - ``generation_gt`` is coerced to ``list[str]``.
    - ``query`` and every ``generation_gt`` string are run through
      ``preprocess_text``.

    Args:
        df: QA dataframe with qid/query/retrieval_gt/generation_gt columns.

    Returns:
        The normalized dataframe (index reset).

    Raises:
        AssertionError: if required columns are missing, or qid/query are
            not all strings.
        ValueError: if a gt value has an unsupported type.
    """

    def cast_retrieval_gt(gt):
        # Target shape: a list of gt groups, each group a list of doc ids.
        if isinstance(gt, str):
            return [[gt]]
        elif isinstance(gt, np.ndarray):
            return cast_retrieval_gt(gt.tolist())
        elif isinstance(gt, list):
            if not gt:
                # No retrieval ground truth for this row: keep the canonical
                # empty shape [[]] instead of crashing on gt[0] below.
                return [gt]
            if isinstance(gt[0], str):
                return [gt]
            elif isinstance(gt[0], list):
                return gt
            elif isinstance(gt[0], np.ndarray):
                return cast_retrieval_gt([x.tolist() for x in gt])
            else:
                raise ValueError(f"retrieval_gt must be str or list, but got {type(gt[0])}")
        else:
            raise ValueError(f"retrieval_gt must be str or list, but got {type(gt)}")

    def cast_generation_gt(gt):
        # Target shape: a flat list of answer strings.
        if isinstance(gt, str):
            return [gt]
        elif isinstance(gt, np.ndarray):
            return cast_generation_gt(gt.tolist())
        elif isinstance(gt, list):
            return gt
        else:
            raise ValueError(f"generation_gt must be str or list, but got {type(gt)}")

    df = df.reset_index(drop=True)
    validate_qa_dataset(df)
    # Both id and query columns must already be plain strings.
    assert df["qid"].apply(lambda x: isinstance(x, str)).sum() == len(df), "qid must be string type."
    assert df["query"].apply(lambda x: isinstance(x, str)).sum() == len(df), "query must be string type."
    df["retrieval_gt"] = df["retrieval_gt"].apply(cast_retrieval_gt)
    df["generation_gt"] = df["generation_gt"].apply(cast_generation_gt)
    df["query"] = df["query"].apply(preprocess_text)
    df["generation_gt"] = df["generation_gt"].apply(lambda x: list(map(preprocess_text, x)))
    return df
def cast_corpus_dataset(df: pd.DataFrame):
    """Normalize a corpus dataframe and return it.

    Drops rows with empty contents, stamps missing
    ``last_modified_datetime`` metadata, fills missing ``prev_id``/``next_id``
    metadata with ``None``, and runs ``preprocess_text`` over contents and
    every string metadata value.

    Args:
        df: corpus dataframe with doc_id/contents/metadata columns.

    Returns:
        The normalized dataframe.

    Raises:
        AssertionError: if required columns or metadata keys are missing.
    """
    df = df.reset_index(drop=True)
    validate_corpus_dataset(df)
    # drop rows that have empty contents
    # NOTE: the previous check used x.isspace(), which lets "" through
    # because "".isspace() is False; strip() catches both cases.
    df = df[~df["contents"].apply(lambda x: x is None or x.strip() == "")]

    def make_datetime_metadata(x):
        # Stamp a timestamp when none was provided; keep existing ones.
        if x is None or x == {}:
            return {"last_modified_datetime": datetime.now()}
        elif x.get("last_modified_datetime") is None:
            return {**x, "last_modified_datetime": datetime.now()}
        else:
            return x

    df["metadata"] = df["metadata"].apply(make_datetime_metadata)
    # check every metadata have a datetime key
    assert sum(
        df["metadata"].apply(lambda x: x.get("last_modified_datetime") is not None)
    ) == len(df), "Every metadata must have a datetime key."

    def make_prev_next_id_metadata(x, id_type: str):
        # Ensure the chunk-linking key exists; default to None (no neighbor).
        if x is None or x == {}:
            return {id_type: None}
        elif x.get(id_type) is None:
            return {**x, id_type: None}
        else:
            return x

    df["metadata"] = df["metadata"].apply(lambda x: make_prev_next_id_metadata(x, "prev_id"))
    df["metadata"] = df["metadata"].apply(lambda x: make_prev_next_id_metadata(x, "next_id"))
    df["contents"] = df["contents"].apply(preprocess_text)

    def normalize_unicode_metadata(metadata: dict):
        # Normalize only string values; leave other types untouched.
        result = {}
        for key, value in metadata.items():
            if isinstance(value, str):
                result[key] = preprocess_text(value)
            else:
                result[key] = value
        return result

    df["metadata"] = df["metadata"].apply(normalize_unicode_metadata)
    # check every metadata have a prev_id, next_id key
    assert all("prev_id" in metadata for metadata in df["metadata"]), "Every metadata must have a prev_id key."
    assert all("next_id" in metadata for metadata in df["metadata"]), "Every metadata must have a next_id key."
    return df
def validate_qa_from_corpus_dataset(qa_df: pd.DataFrame, corpus_df: pd.DataFrame):
    """Assert that every doc_id referenced by qa_df's retrieval_gt exists in corpus_df.

    Rows whose retrieval_gt is empty (e.g. the canonical [[]] shape, or an
    empty ndarray) contribute no ids and are skipped.

    Args:
        qa_df: QA dataframe with a ``retrieval_gt`` column of list-of-lists
            (or ndarray-of-arrays) of doc ids.
        corpus_df: corpus dataframe with a ``doc_id`` column.

    Raises:
        AssertionError: if any referenced doc_id is absent from corpus_df.
    """
    qa_ids = []
    for retrieval_gt in qa_df["retrieval_gt"].tolist():
        # Guard len(...) > 0 before peeking at the first element so a fully
        # empty gt cannot raise IndexError.
        if (
            isinstance(retrieval_gt, list)
            and len(retrieval_gt) > 0
            and (retrieval_gt[0] != [] or any(bool(g) is True for g in retrieval_gt))
        ):
            for gt in retrieval_gt:
                qa_ids.extend(gt)
        elif (
            isinstance(retrieval_gt, np.ndarray)
            and len(retrieval_gt) > 0
            and retrieval_gt[0].size > 0
        ):
            for gt in retrieval_gt:
                qa_ids.extend(gt)
    # Build the membership set once: O(ids) lookups instead of filtering the
    # whole corpus dataframe for every id (previously O(rows * ids)).
    corpus_ids = set(corpus_df["doc_id"].tolist())
    no_exist_ids = [qa_id for qa_id in qa_ids if qa_id not in corpus_ids]
    assert (
        len(no_exist_ids) == 0
    ), f"{len(no_exist_ids)} doc_ids in retrieval_gt do not exist in corpus_df."