analytics.py 12.5 KB
Newer Older
Michael Vötter's avatar
Michael Vötter committed
1
"""Module containing tools to run result analytics."""
Benjamin Murauer's avatar
Benjamin Murauer committed
2
import numpy as np
Michael Vötter's avatar
Michael Vötter committed
3
4
5
6
import pandas as pd

from .db import DB
from .db import DbModel
7
from .utils import LOGGER
Michael Vötter's avatar
Michael Vötter committed
8
9
10
11
12
13
14
15
from .utils import load_project_config


def _extract_metric_results(outcome, requested_metric):
    return outcome.apply(
        lambda row: row.apply(lambda value: value[requested_metric]))


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def get_results_as_dataframe(project_name,
                             table_name='results',
                             filter_git_dirty=True):
    """Returns the results stored in the database as a pandas dataframe.

    Args:
        project_name: the project name to fetch results for.
        table_name: the name of the results table.
        filter_git_dirty: if True, results produced from a dirty git working
            copy are filtered out.

    Returns: a pandas DataFrame containing all result rows of the project.
    """
    results = pd.read_sql_table(table_name=table_name, con=DB.engine)

    if filter_git_dirty:
        # element-wise '== False' on the column; E712 is silenced on purpose.
        # NOTE(review): presumably chosen over '~' to also drop non-boolean
        # values (e.g. NULL) coming from the database — confirm.
        results = results[results['git_is_dirty'] == False]  # noqa: E712

    return results[results['project_name'] == project_name]


Michael Vötter's avatar
Michael Vötter committed
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
def fetch_by_git_commit_id(git_commit_id):
    """Returns a query that is filtered by git commit id.

    Args:
        git_commit_id: the commit id of the returned results.
    """
    db_session = DB.session()
    query = db_session.query(DbModel)
    return query.filter(DbModel.git_commit_id == git_commit_id)


def fetch_by_row_ids(from_id, to_id=None):
    """Returns a query over a range of database ids.

    Args:
        from_id: the smallest database id included in the results of the
            query.
        to_id: if specified, this is the biggest database id included in the
            results of the query.
    """
    db_session = DB.session()
    query = db_session.query(DbModel).filter(DbModel.id >= from_id)
    if to_id is None:
        return query
    return query.filter(DbModel.id <= to_id)


def fetch_by_project_name(project=None):
    """Returns a query filtered by project.

    Args:
        project: the name of the project to extract results. If None, the
            project in the dbispipeline.ini is used.
    """
    name = project
    if name is None:
        name = load_project_config()['project']['name']

    db_session = DB.session()
    return db_session.query(DbModel).filter(DbModel.project_name == name)


def get_cv_epoch_evaluator_results(requested_metric=None,
                                   query_function=fetch_by_project_name):
    """Extracts CvEpochEvaluator results from the database.

    Args:
        requested_metric: allows to restrict the results to a single metric.
        query_function: a function returning a SQLAlchemy query when called.

    Yields: tuples with the prepared results as first element and the whole
        db entry as the second element. The prepared results are either a
        pandas dataframe (if a metric is requested) or a dict containing a
        pandas dataframe per available metric.
    """
    for entry in query_function():
        # only CvEpochEvaluator runs are of interest here
        if entry.evaluator['class'] != 'CvEpochEvaluator':
            continue

        outcome = pd.DataFrame(entry.outcome)
        available_metrics = entry.evaluator['scoring']

        if requested_metric is None:
            per_metric = {
                metric: _extract_metric_results(outcome, metric)
                for metric in available_metrics
            }
            yield per_metric, entry
        elif requested_metric in available_metrics:
            yield _extract_metric_results(outcome, requested_metric), entry
Benjamin Murauer's avatar
Benjamin Murauer committed
101
102


103
104
def rows_to_dataframe(rows,
                      allow_git_different_rows=False,
                      allow_git_dirty_rows=False):
    """
    Converts database rows to a pandas DataFrame.

    args:
        rows: some object that iterates over rows. May be a query result or an
            actual list of rows. If this field is None or empty, an empty
            DataFrame will be returned.
        allow_git_different_rows: if set to true, allows that rows might have
            different git commit ids. Otherwise, an exception is thrown.
        allow_git_dirty_rows: if set to true, allows that rows have a dirty
            git status. Otherwise, an exception is thrown.

    returns: a pandas DataFrame with all columns of the database as columns.

    raises:
        ValueError: if the rows span multiple git commit ids or contain dirty
            git rows while the corresponding flag does not allow it.
    """
    # check None before materializing: list(None) would raise a TypeError
    if rows is None:
        return pd.DataFrame()

    # rows might be a query, which is not yet fetched from the database;
    # materialize it so it can be indexed and iterated more than once.
    # (np.ndarray is the array *type*; np.array is only a factory function.)
    if not isinstance(rows, (list, np.ndarray)):
        rows = list(rows)

    if len(rows) == 0:
        return pd.DataFrame()

    git_ids = {row.git_commit_id for row in rows}
    if not allow_git_different_rows and len(git_ids) > 1:
        raise ValueError(f'your result contains multiple git ids: {git_ids}')

    # the DbModel objects will have additional columns that are not
    # interesting for the underlying application. In the __table__.columns
    # field, the actual list of "payload"-columns is stored.
    columns = [column.name for column in rows[0].__table__.columns]
    df_rows = []
    for row in rows:
        if not allow_git_dirty_rows and row.git_is_dirty:
            raise ValueError('your result contains dirty git rows')

        df_rows.append({column: getattr(row, column) for column in columns})

    return pd.DataFrame(df_rows)


def _read_parameters(dictionary, prefix='', use_prefix=True):
    """Recursive helper method for extracting GridSearch param information."""
    if len(dictionary) == 0:
        return dictionary
    result = {}
    for k, v in dictionary.items():
155
156
157
158
159
160
161
        # PipelineHelper is the root of the model
        if k == 'selected_model':
            k = prefix + k if use_prefix else k
            result[k] = str(v[0])
            result.update(_read_parameters(v[1], v[0] + '__'))
        # PipelineHelper is somewhere else
        elif '__selected_model' in k:
Benjamin Murauer's avatar
Benjamin Murauer committed
162
163
164
            # the [:-16] cuts off the string '__selected_model'
            key_name = prefix + k[:-16] if use_prefix else k[:-16]
            result[key_name] = str(v[0])
165
166
            result.update(_read_parameters(v[1], v[0] + '__'))
        # "Regular" GridSearch element
Benjamin Murauer's avatar
Benjamin Murauer committed
167
168
169
170
171
172
173
        else:
            key_name = prefix + k if use_prefix else k
            result[key_name] = f'{v}'
    return result


def extract_gridsearch_parameters(
    df,
    score_name,
    drop_outcome=True,
    prefix_parameter_names=True,
):
    """
    Extracts parameters from a grid search result.

    This method creates one DataFrame row for each parameter combination in
    the "outcome -> cv_results" field, and one column of each distinct
    parameter. For example, if your grid search contains a parameter
    `svm__C: [1, 10]`, then this method will add a column `C` to your
    DataFrame, and replace this row with two rows for the values 1 and 10.

    This method will recursively resolve parameters used in PipelineHelpers.

    Depending on your configurations, the output of this method may make the
    row ids no longer unique.

    before:
    row0 = {
        dataloader: XY,
        outcome: {
            cv_results: {
                params: {
                    1: { SVM__C:  1},
                    2: { SVM__C: 10},
                }
            }
        }
    }

    after:
    row0 = {
        dataloader: XY,
        SVM__C: 1,
    }
    row1 = {
        dataloader: XY,
        SVM__C: 10,
    }

    args:
        df: a pandas DataFrame object that has one column 'outcome', which
            contains dictionaries which have a field 'cv_results'. Notably,
            this is the case for the result of the rows_to_dataframe method.
        score_name: name of the field that the score should be extracted from.
            This can be a single string (e.g., 'mean_test_score') or an
            iterable of strings.
        drop_outcome: if true, the resulting dataframe will no longer have the
            original 'outcome' column.
        prefix_parameter_names: This parameter only affects models which have
            a PipelineHelper. If true, the resulting parameter names are
            returned by their full name. If set to false, only the part within
            the PipelineHelper is returned.
            Note that omitting this prefix may result in multiple parameters
            with the same name, possibly leading to grouping unrelated fields.

    returns: a pandas DataFrame with all possible parameters as columns, and
        all distinct parameter combinations as rows.

    raises:
        ValueError: if a row has no 'outcome' or no 'cv_results' within it.
    """
    # normalize once up front; the original re-checked and rebound the
    # parameter inside the inner loop on every combination
    if isinstance(score_name, str):
        score_names = [score_name]
    else:
        score_names = list(score_name)

    result_rows = []
    for _, row in df.iterrows():
        if 'outcome' not in row or 'cv_results' not in row['outcome']:
            raise ValueError(
                'this result set does not seem to contain grid '
                "search results, missing field: row['outcome']['cv_results']")
        cv = row['outcome']['cv_results']
        for combination_id, combination in cv['params'].items():
            # prevent unnecessary copy of the outcome field, which may be big
            result_row = {k: v for k, v in row.items() if k != 'outcome'}
            if not drop_outcome:
                result_row['outcome'] = row['outcome']

            for score in score_names:
                result_row[score] = cv[score][combination_id]

            result_row.update(
                _read_parameters(combination,
                                 use_prefix=prefix_parameter_names))
            result_rows.append(result_row)
    return pd.DataFrame(result_rows)
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315


def load_dataframe(allow_multiple_git_ids=False, allow_dirty_rows=False,
                   **query_filters):
    """
    A flexible wrapper for queries.

    Args:
        allow_multiple_git_ids (bool): If false, raises an error when loading
            results with different git commit ids
        allow_dirty_rows (bool): If false, raises an error when loading rows
            resulting from a dirty git state
        **query_filters: key-value restrictions on the DB query. The values
            are expected to be strings and can have three different
            representations:
            - single values, then they will be matched with ==
            - comma separated values, then they will be matched with 'in (...)'
            - values separated with ' to ', then they will be matched with >=
                and <= if one of the two borders is an asterisk, only the other
                border will be used. Couldn't use the dash, as it is needed in
                date-based queries.

    Returns:
        A dataframe resulting from the query, and a dict containing additional
        information: the git and row ids used in this result, and the set of
        scores that were found in the result (None if no grid search results
        were loaded).

    Raises:
        ValueError: if the loaded rows have different scoring fields.

    Examples:
        - load a single git commit:
            df, info = load_dataframe(git_commit_id='2aace91d317694a08...')
        - load all results between two row ids
            df, info = load_dataframe(id="1200 to 1300")
        - specifiy multiple query filters
            df, info = load_dataframe(
                git_commit_id="...",
                sourcefile="plan1.py,plan2.py",
            )
    """
    session = DB.session()
    query = session.query(DbModel)
    for field, value in query_filters.items():
        if not value:
            continue

        # NOTE(review): values are assumed to be strings here; non-string
        # filter values would fail the ' to ' containment test — confirm.
        if ' to ' in value:
            min_val, max_val = value.split(' to ')
            if min_val != '*':
                query = query.filter(getattr(DbModel, field) >= min_val)
            if max_val != '*':
                query = query.filter(getattr(DbModel, field) <= max_val)
        elif ',' in value:
            query = query.filter(getattr(DbModel, field).in_(value.split(',')))
        else:
            query = query.filter(getattr(DbModel, field) == value)

    df = rows_to_dataframe(query, allow_multiple_git_ids, allow_dirty_rows)
    LOGGER.debug('loaded %s raw rows from db', df.shape[0])

    extra_info = {'scores': None, 'git_ids': set(), 'row_ids': set()}
    if df.shape[0] == 0:
        # nothing matched: return early instead of failing on the missing
        # 'git_commit_id'/'id' columns of an empty DataFrame
        return df, extra_info

    scores = None
    has_cv_results = False
    for _, row in df.iterrows():
        # grid search results keep their cv_results nested inside the
        # 'outcome' column; it is not a top-level column of the row
        outcome = row['outcome']
        if not isinstance(outcome, dict) or 'cv_results' not in outcome:
            continue
        has_cv_results = True

        row_scores = {k for k in outcome['cv_results'].keys()
                      if k.startswith('mean_test_')}
        if scores is None:
            scores = row_scores
        elif scores != row_scores:
            raise ValueError(
                'you have loaded rows that have different scoring fields, '
                f'which is not supported: {scores} vs. {row_scores}')

    LOGGER.debug('extracted scores: %s', scores)
    extra_info['scores'] = scores
    extra_info['git_ids'] = set(df['git_commit_id'].values)
    extra_info['row_ids'] = set(df['id'].values)
    if has_cv_results:
        df = extract_gridsearch_parameters(df, scores)
    return df, extra_info