def generate_qa_row(llm, corpus_data_row):
    """
    Build one question/answer pair for a single corpus passage with a guidance chat model.

    The conversation is assembled in four turns: a user turn carrying the
    instructions and the passage, an assistant turn that generates the question
    (captured as "query"), a user turn that opens the "answer" field, and an
    assistant turn that generates the answer (captured as "generation_gt").

    Side effect: ``corpus_data_row["metadata"]["qa_generation"]`` is set to
    ``"simple"`` in place, so the row must already hold a mutable "metadata" mapping.

    :param llm: guidance model that the chat turns are appended to
    :param corpus_data_row: row with a "contents" entry (the passage) and a "metadata" mapping
    :return: dict with the keys "query" and "generation_gt"
    """
    import guidance
    from guidance import gen

    conversation = llm

    # Instruction turn; the passage text is interpolated into the JSON skeleton.
    with guidance.user():
        conversation += f""" You have to found a passge to solve "the problem". You need to build a clean and clear set of (problem, passage, answer) in json format so that you don't have to ask about "the problem" again. problem need to end with question mark("?"). The process of approaching the answer based on the information of the given passage must be clearly and neatly displayed in the answer.\n\n Here is set of (problem, passage, answer) in JSON format:\n{{\n "passage": {corpus_data_row["contents"]}\n "problem": """

    # Question generation halts at the first "?".
    # NOTE(review): presumably the stop text itself is not part of the captured
    # "query" value - confirm against guidance's `gen` stop semantics.
    with guidance.assistant():
        conversation += gen("query", stop="?")

    with guidance.user():
        conversation += """ "answer": """

    with guidance.assistant():
        conversation += gen("generation_gt")

    # Tag the source row so downstream consumers can see how the pair was produced.
    corpus_data_row["metadata"]["qa_generation"] = "simple"

    return {
        "query": conversation["query"],
        "generation_gt": conversation["generation_gt"],
    }
def generate_simple_qa_dataset(
    llm,
    corpus_data: pd.DataFrame,
    output_filepath: str,
    generate_row_function: Callable,
    **kwargs,
) -> pd.DataFrame:
    """
    Turn corpus_data into a QA dataset and save it as a parquet file.

    One QA row is produced per corpus row by calling ``generate_row_function``.
    All path checks run before any row is generated, so an invalid output path
    fails fast without calling the model.

    :param llm: guidance.models.Model, forwarded to generate_row_function
    :param corpus_data: pd.DataFrame whose rows provide "doc_id" and "metadata"
        (plus whatever generate_row_function reads from the row)
    :param output_filepath: str or os.PathLike. Its parent directory must exist,
        the file itself must not exist, and the name must end with ".parquet"
    :param generate_row_function: called as
        generate_row_function(llm=llm, corpus_data_row=row, **kwargs);
        must return a dict containing "query" and "generation_gt"
    :param kwargs: extra keyword arguments forwarded to generate_row_function
    :return: qa_dataset as pd.DataFrame with the columns
        "qid", "query", "retrieval_gt", "generation_gt", "metadata"
    :raises NotADirectoryError: if the parent directory of output_filepath does not exist
    :raises NameError: if output_filepath does not end with ".parquet"
    :raises FileExistsError: if output_filepath already exists
    """
    # Normalise os.PathLike input so the string checks below also work for Path objects.
    output_filepath = os.fspath(output_filepath)
    output_path = pathlib.PurePath(output_filepath)
    output_file_dir = output_path.parent

    if not os.path.isdir(output_file_dir):
        raise NotADirectoryError(f"directory {output_file_dir} not found.")
    # Check the real extension (with the dot); a bare "parquet" suffix such as
    # "fooparquet" must be rejected.
    if not output_filepath.endswith(".parquet"):
        raise NameError(
            f'file path: {output_filepath} filename extension need to be ".parquet"'
        )
    if os.path.exists(output_filepath):
        # PurePath.name yields the file name regardless of the platform's separator.
        raise FileExistsError(
            f"{output_path.name} already exists in {output_file_dir}."
        )

    qa_data_lst = []
    for _, corpus_data_row in corpus_data.iterrows():
        response = generate_row_function(
            llm=llm, corpus_data_row=corpus_data_row, **kwargs
        )
        # "metadata" is read after the call, so any changes the row function
        # made to the row's metadata are carried into the QA row.
        qa_data_lst.append(
            {
                "qid": str(uuid.uuid4()),
                "query": response["query"],
                "retrieval_gt": [[corpus_data_row["doc_id"]]],
                "generation_gt": [response["generation_gt"]],
                "metadata": corpus_data_row["metadata"],
            }
        )

    # Fix the column set explicitly so an empty corpus still yields the expected schema.
    qa_columns = ["qid", "query", "retrieval_gt", "generation_gt", "metadata"]
    qa_dataset = pd.DataFrame(qa_data_lst, columns=qa_columns)
    qa_dataset.to_parquet(output_filepath, index=False)
    return qa_dataset