Skip to content
Snippets Groups Projects
Commit b55aa19a authored by Benjamin Murauer's avatar Benjamin Murauer
Browse files

fixed mypy issues

parent cec20282
No related branches found
No related tags found
No related merge requests found
......@@ -3,11 +3,8 @@ from typing import Callable
from sklearn.datasets import fetch_20newsgroups
from tuhlbox.stringkernels import (
from tuhlbox.stringkernels import ( # legacy_intersection_kernel,; legacy_presence_kernel,; legacy_spectrum_kernel,
intersection_kernel,
legacy_intersection_kernel,
legacy_presence_kernel,
legacy_spectrum_kernel,
presence_kernel,
spectrum_kernel,
)
......@@ -23,11 +20,11 @@ def benchmark(kernel_method: Callable) -> float:
kernels = [
intersection_kernel,
legacy_intersection_kernel,
# legacy_intersection_kernel,
presence_kernel,
legacy_presence_kernel,
# legacy_presence_kernel,
spectrum_kernel,
legacy_spectrum_kernel,
# legacy_spectrum_kernel,
]
for kernel in kernels:
......
......@@ -83,7 +83,7 @@ class Doc2VecTransformer(TransformerMixin, BaseEstimator):
def fit(
self,
documents: List[List[str]],
labels: Union[List[str], np.array] = None,
labels: Union[List[str], np.ndarray] = None,
**fit_params: Any
) -> Doc2VecTransformer:
"""Fit the model by learning the training corpus."""
......@@ -99,7 +99,7 @@ class Doc2VecTransformer(TransformerMixin, BaseEstimator):
)
return self
def transform(self, documents: List[List[str]]) -> np.array:
def transform(self, documents: List[List[str]]) -> np.ndarray:
"""Infer the vectors for documents."""
documents = [[str(x) for x in document] for document in documents]
_sanity_check(documents)
......
......@@ -3,7 +3,7 @@ from __future__ import annotations
import random
from collections import defaultdict
from typing import List, Union, Dict
from typing import Dict, List, Union
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
......@@ -75,13 +75,13 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
self.sample_type = sample_type
self.force = force
def fit(self, _x: List[str], _y: Union[List, np.array] = None) -> LifeVectorizer:
def fit(self, _x: List[str], _y: Union[List, np.ndarray] = None) -> LifeVectorizer:
    """No-op fit: this vectorizer learns nothing from the training data.

    Both arguments are accepted purely for scikit-learn estimator-API
    compatibility (hence the underscore-prefixed names) and are ignored.

    Returns the estimator itself so calls can be chained.
    """
    # Stateless transformer — nothing to learn here.
    return self
def sample(
self,
words: Union[List[str], np.array],
words: List[str],
fragment_size: int,
method: str,
) -> List[List[str]]:
......@@ -120,7 +120,7 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
self,
document: List[str],
sample_size: int,
) -> np.array:
) -> np.ndarray:
"""Extract features from a document given a sample size."""
if self.sample_type == "both":
return np.concatenate(
......@@ -137,12 +137,12 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
document: List[str],
fragment_size: int,
method: str,
) -> np.array:
) -> np.ndarray:
samples = self.sample(document, fragment_size, method)
features = []
features_as_list = []
for sample in samples:
features.append(get_features_for_sample(sample))
features = np.array(features)
features_as_list.append(get_features_for_sample(sample))
features = np.array(features_as_list)
means = np.mean(features, axis=0)
stds = np.std(features, axis=0)
return np.concatenate(
......@@ -150,8 +150,8 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
)
def transform(
self, x: List[List[str]], _y: Union[List, np.array] = None
) -> np.array:
self, x: List[List[str]], _y: Union[List, np.ndarray] = None
) -> np.ndarray:
"""Calculate samples and extracts features from documents."""
ret = []
for document in x:
......
......@@ -4,7 +4,8 @@ from __future__ import annotations
import logging
import math
from collections import defaultdict
from typing import Any, Dict, Iterable, List
from functools import partial
from typing import Any, DefaultDict, Dict, Iterable, List
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
......@@ -17,7 +18,9 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
def __init__(self) -> None:
"""Initialize the model."""
self.t: Dict[str, np.array] = defaultdict(np.array)
self.t: DefaultDict[str, np.ndarray] = defaultdict(
partial(np.ndarray, 0)
) # type:ignore
def fit(
self, X: List[str], y: Iterable[str], **_fit_params: Any
......@@ -58,9 +61,9 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
return self
def transform(self, X: Iterable[str], _y: Any = None) -> np.array:
def transform(self, X: Iterable[str], _y: Any = None) -> np.ndarray:
"""Transform data due to previously learned frequencies."""
result = []
result: List[int] = []
for k in X:
document_sum = 0
doc_words = k.split()
......@@ -68,6 +71,6 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
if j not in self.t:
continue
tf = doc_words.count(j)
document_sum += self.t[j] * tf / len(doc_words)
document_sum += int(self.t[j] * tf / len(doc_words))
result.append(document_sum)
return np.array(result)
"""Transformers working on NLTK tree objects."""
from __future__ import annotations
from typing import Any, Iterable, List, Union
from typing import Any, List, Union
import nltk
import numpy as np
......@@ -178,7 +178,7 @@ class TreeChainTransformer(BaseEstimator, TransformerMixin):
return result
def _get_average_height(document: List[nltk.Tree]) -> np.array:
def _get_average_height(document: List[nltk.Tree]) -> float:
"""Calculate the average height of all trees in a document."""
return np.average([tree.height() for tree in document])
......@@ -194,14 +194,14 @@ def _calculate_average_children(tree: nltk.Tree) -> List[int]:
return result
def _get_average_children(document: List[nltk.Tree]) -> np.array:
def _get_average_children(document: List[nltk.Tree]) -> float:
    """Average the per-node child counts gathered from every tree.

    Flattens the lists produced by ``_calculate_average_children`` for each
    tree and returns their overall mean.
    """
    counts: List[int] = []
    for parse_tree in document:
        counts.extend(_calculate_average_children(parse_tree))
    return np.average(counts)
def _get_average_inner_to_leaf_ratio(document: List[nltk.Tree]) -> np.array:
def _get_average_inner_to_leaf_ratio(document: List[nltk.Tree]) -> float:
result = []
for tree in document:
num_leaves = len(tree.leaves())
......@@ -218,7 +218,7 @@ def _get_max_tree_width(tree: nltk.Tree) -> int:
return maximum
def _get_max_child_width(document: List[nltk.Tree]) -> np.array:
def _get_max_child_width(document: List[nltk.Tree]) -> float:
result = []
for tree in document:
result.append(_get_max_tree_width(tree))
......
Loading, or the content failed to load. Please try again.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment