Commit 16e42bb5 authored by Benjamin Murauer
Browse files

release r0.8.30

parent 4a146315
Pipeline #72664 passed with stage in 2 minutes and 30 seconds
[bumpversion]
current_version = 0.8.29
current_version = 0.8.30
[bumpversion:file:setup.py]
......
......@@ -11,7 +11,7 @@ with open('README.md') as fh:
setup(
name='dbispipeline',
version='0.8.29',
version='0.8.30',
author='Benjamin Murauer, Michael Vötter',
author_email='b.murauer@posteo.de',
description='should make things more reproducible',
......
......@@ -31,8 +31,9 @@ def _limit(
) -> Tuple[pd.DataFrame, np.array]:
df, key = _attach(dataset_part[0], dataset_part[1])
sub_df = df[df[key].isin(remaining_targets)]
first_column = sub_df.columns[0]
groups = sub_df.groupby(key)
min_population = groups.count()['text_raw'].min()
min_population = groups.count()[first_column].min()
if max_docs_per_target:
if min_population > max_docs_per_target:
sub_df = sub_df.groupby(key).sample(max_docs_per_target)
......@@ -178,6 +179,7 @@ class TrainTestSplitLoader(Loader):
self.max_targets,
len(all_targets),
)
selected_targets = list(all_targets)
else:
selected_targets = list(all_targets)
train = _limit(train, selected_targets,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment