import functools
import time
from typing import List, Iterable, Tuple, Any, Optional, Callable
import numpy as np
import pandas as pd
[docs]
def measure_speed(func, *args, **kwargs):
    """
    Call ``func`` with the given arguments and measure how long the call takes.

    The duration is taken from ``time.perf_counter``, a monotonic clock, so the
    measurement is not affected by system clock adjustments and can never be negative.

    :param func: The callable to execute.
    :param args: Positional arguments forwarded to ``func``.
    :param kwargs: Keyword arguments forwarded to ``func``.
    :return: A tuple of the value returned by ``func`` and the elapsed time in seconds.
    """
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start_time
    return result, elapsed
[docs]
def avoid_empty_result(return_index: List[int]):
    """
    Decorator for avoiding empty results from the function.

    When the func returns an empty result or None, the origin results are returned instead.
    When the return value is a tuple, every element is checked; the origin results are
    returned only if all of them are empty.
    The origin results are the func's parameters at the positions given in return_index.

    A parameter at one of those positions does not have to be passed positionally:
    if it was passed by keyword, or omitted so that its default applies, that value is used.

    :param return_index: The index of the result to be returned when there is no result.
    :return: The origin results (as a list) or the results from the function.
    :raises IndexError: When an index in return_index cannot be resolved to an argument.
    """
    # Local import so the module's import block does not need to change.
    import inspect

    def _is_empty(value) -> bool:
        """Return True when ``value`` is falsy, or has length 0 if it refuses truth-testing."""
        try:
            return not bool(value)
        except ValueError:
            # numpy arrays and pandas objects raise ValueError on bool(); use their length.
            return len(value) == 0

    def decorator_avoid_empty_result(func: Callable):
        try:
            parameters = list(inspect.signature(func).parameters.values())
        except (TypeError, ValueError):
            # Some callables expose no signature; then only positional lookup is possible.
            parameters = []

        named_kinds = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )

        def _origin_value(index: int, args: tuple, kwargs: dict):
            """Resolve the argument at ``index`` from positionals, keywords, then defaults."""
            if index < len(args):
                # Covers negative indices too; an out-of-range one raises IndexError.
                return args[index]
            parameter = parameters[index]  # IndexError when func has no such parameter
            if parameter.kind in named_kinds:
                if parameter.name in kwargs:
                    return kwargs[parameter.name]
                if parameter.default is not inspect.Parameter.empty:
                    return parameter.default
            raise IndexError(
                f"Cannot resolve the argument at index {index} of {func.__name__}."
            )

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> List:
            func_result = func(*args, **kwargs)
            if isinstance(func_result, tuple):
                # A tuple counts as empty only when every element in it is empty.
                nothing_returned = all(_is_empty(item) for item in func_result)
            else:
                nothing_returned = _is_empty(func_result)
            if nothing_returned:
                return [_origin_value(index, args, kwargs) for index in return_index]
            return func_result

        return wrapper

    return decorator_avoid_empty_result
[docs]
@avoid_empty_result([0, 3])
def filter_by_threshold(results, value, threshold, metadatas=None) -> Tuple[List, List]:
    """
    Filter results by value's threshold.

    A result is kept when its paired value is lower than or equal to the threshold.
    When nothing passes, this function itself produces two empty lists; it is wrapped by
    ``avoid_empty_result([0, 3])``, which then hands back the caller's original arguments
    at those positions instead.

    :param results: The result list to be filtered.
    :param value: The value list to be filtered.
        It must have the same length with results.
    :param threshold: The threshold value.
    :param metadatas: The metadata of each result.
        The three sequences are zipped together, so iteration stops at the shortest one.
    :return: Filtered list of results and filtered list of metadatas.
        Metadatas will be returned even if you did not give input metadatas.
    :rtype: Tuple[List, List]
    """
    if metadatas is None:
        metadatas = [None] * len(results)
    assert len(results) == len(value), "results and value must have the same length."
    # Collect the survivors explicitly rather than relying on a ValueError from unpacking
    # an empty zip: that approach also hid real ValueErrors raised by the comparison.
    kept = [
        (result, metadata)
        for result, score, metadata in zip(results, value, metadatas)
        if score <= threshold
    ]
    filtered_results = [result for result, _ in kept]
    filtered_metadatas = [metadata for _, metadata in kept]
    return filtered_results, filtered_metadatas
[docs]
def select_best(
    results: List[pd.DataFrame],
    columns: Iterable[str],
    metadatas: Optional[List[Any]] = None,
    strategy_name: str = "mean",
) -> Tuple[pd.DataFrame, Any]:
    """
    Select the best result with the strategy named by ``strategy_name``.

    The work is delegated to one of the selection functions of this module:
    "mean" uses select_best_average, "rank" uses select_best_rr and
    "normalize_mean" uses select_normalize_mean.

    :param results: The list of results.
    :param columns: Column names handed to the selected strategy.
    :param metadatas: The metadata of each result.
    :param strategy_name: The name of the strategy. Default is "mean".
    :return: Whatever the selected strategy returns for the given inputs.
    :raises ValueError: When strategy_name is not one of the known strategy names.
    :rtype: Tuple[pd.DataFrame, Any]
    """
    strategy_func_dict = {
        "mean": select_best_average,
        "rank": select_best_rr,
        "normalize_mean": select_normalize_mean,
    }
    chosen_strategy = strategy_func_dict.get(strategy_name)
    if chosen_strategy is None:
        raise ValueError(
            f"Input strategy name {strategy_name} is not in {strategy_func_dict.keys()}"
        )
    return chosen_strategy(results, columns, metadatas)
[docs]
def select_best_average(
    results: List[pd.DataFrame],
    columns: Iterable[str],
    metadatas: Optional[List[Any]] = None,
) -> Tuple[pd.DataFrame, Any]:
    """
    Pick the result whose average over the given columns is the highest.

    The inputs are first passed through validate_strategy_inputs, and the values it
    returns are used from then on. For every result, the selected columns are averaged
    per row and those row averages are averaged again into a single score.
    When several results share the highest score, the earliest one is chosen.

    :param results: The list of results.
        Each result must be pd.DataFrame.
    :param columns: Column names to be averaged.
        They are the standard for selecting the best result.
    :param metadatas: The metadata of each result.
        The metadata paired with the best result is returned.
    :return: The best result and the best metadata.
    :rtype: Tuple[pd.DataFrame, Any]
    """
    results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas)
    scores = []
    for frame in results:
        row_means = frame[columns].mean(axis=1)
        scores.append(row_means.mean())
    # max() keeps the first of equally large items, so ties go to the earliest result.
    best_index = max(range(len(scores)), key=scores.__getitem__)
    return results[best_index], metadatas[best_index]
[docs]
def select_best_rr(
    results: List[pd.DataFrame],
    columns: Iterable[str],
    metadatas: Optional[List[Any]] = None,
) -> Tuple[pd.DataFrame, Any]:
    """
    Select the best result by the sum of reciprocal ranks over the given columns.

    The inputs are first passed through validate_strategy_inputs, and the values it
    returns are used from then on. Each result is reduced to one mean per column.
    Within every column the results are ranked in descending order, so the highest
    mean gets rank 1. The reciprocal of each rank is taken and summed per result,
    and the result with the largest sum is selected.

    :param results: The list of results.
        Each result must be pd.DataFrame.
    :param columns: Column names used for ranking the results.
    :param metadatas: The metadata of each result.
        The metadata paired with the best result is returned.
    :return: The best result and the best metadata.
    :rtype: Tuple[pd.DataFrame, Any]
    """
    results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas)
    each_average_df = pd.DataFrame(
        [df[columns].mean(axis=0).to_dict() for df in results]
    )
    rank_df = each_average_df.rank(ascending=False)
    # Vectorized reciprocal: same values as a per-cell lambda, without needing DataFrame.map.
    rr_df = 1 / rank_df
    best_index = np.array(rr_df.sum(axis=1)).argmax()
    return results[best_index], metadatas[best_index]
[docs]
def select_normalize_mean(
    results: List[pd.DataFrame],
    columns: Iterable[str],
    metadatas: Optional[List[Any]] = None,
) -> Tuple[pd.DataFrame, Any]:
    """
    Select the best result by the sum of min-max normalized column means.

    The inputs are first passed through validate_strategy_inputs, and the values it
    returns are used from then on. Each result is reduced to one mean per column.
    Every column is then rescaled across the results with (mean - min) / (max - min),
    the rescaled values are summed per result, and the result with the largest sum wins.

    :param results: The list of results.
        Each result must be pd.DataFrame.
    :param columns: Column names used for scoring the results.
    :param metadatas: The metadata of each result.
        The metadata paired with the best result is returned.
    :return: The best result and the best metadata.
    :rtype: Tuple[pd.DataFrame, Any]
    """
    results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas)
    mean_records = []
    for frame in results:
        mean_records.append(frame[columns].mean(axis=0).to_dict())
    column_means = pd.DataFrame(mean_records)
    lowest = column_means.min()
    # NOTE(review): a column whose means are all equal makes this span zero - confirm
    # that the resulting 0/0 values are handled as intended by the sum below.
    span = column_means.max() - lowest
    scaled = (column_means - lowest) / span
    totals = scaled.sum(axis=1)
    best_index = totals.argmax()
    return results[best_index], metadatas[best_index]