Commit b55aa19a authored by Benjamin Murauer

fixed mypy issues

parent cec20282
@@ -3,11 +3,8 @@ from typing import Callable
 from sklearn.datasets import fetch_20newsgroups
-from tuhlbox.stringkernels import (
+from tuhlbox.stringkernels import (  # legacy_intersection_kernel,; legacy_presence_kernel,; legacy_spectrum_kernel,
     intersection_kernel,
-    legacy_intersection_kernel,
-    legacy_presence_kernel,
-    legacy_spectrum_kernel,
     presence_kernel,
     spectrum_kernel,
 )
@@ -23,11 +20,11 @@ def benchmark(kernel_method: Callable) -> float:
 kernels = [
     intersection_kernel,
-    legacy_intersection_kernel,
+    # legacy_intersection_kernel,
     presence_kernel,
-    legacy_presence_kernel,
+    # legacy_presence_kernel,
     spectrum_kernel,
-    legacy_spectrum_kernel,
+    # legacy_spectrum_kernel,
 ]
 for kernel in kernels:
...
@@ -83,7 +83,7 @@ class Doc2VecTransformer(TransformerMixin, BaseEstimator):
     def fit(
         self,
         documents: List[List[str]],
-        labels: Union[List[str], np.array] = None,
+        labels: Union[List[str], np.ndarray] = None,
         **fit_params: Any
     ) -> Doc2VecTransformer:
         """Fit the model by learning the training corpus."""
@@ -99,7 +99,7 @@ class Doc2VecTransformer(TransformerMixin, BaseEstimator):
         )
         return self

-    def transform(self, documents: List[List[str]]) -> np.array:
+    def transform(self, documents: List[List[str]]) -> np.ndarray:
         """Infer the vectors for documents."""
         documents = [[str(x) for x in document] for document in documents]
         _sanity_check(documents)
...
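The recurring annotation change above (and in the files that follow) comes down to a NumPy detail worth spelling out: np.array is a factory function, not a type, so mypy rejects it in annotations, whereas np.ndarray is the actual array class. A minimal sketch, with a hypothetical function not taken from the repository:

# Minimal sketch (hypothetical function, not from the repository).
from typing import List

import numpy as np


def transform_sketch(documents: List[List[str]]) -> np.ndarray:
    # np.ndarray names the array class, which mypy accepts as a type;
    # annotating "-> np.array" would be flagged, since np.array is a function.
    return np.zeros((len(documents), 3))


print(transform_sketch([["a", "b"], ["c"]]).shape)  # (2, 3)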
@@ -3,7 +3,7 @@ from __future__ import annotations
 import random
 from collections import defaultdict
-from typing import List, Union, Dict
+from typing import Dict, List, Union

 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -75,13 +75,13 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
         self.sample_type = sample_type
         self.force = force

-    def fit(self, _x: List[str], _y: Union[List, np.array] = None) -> LifeVectorizer:
+    def fit(self, _x: List[str], _y: Union[List, np.ndarray] = None) -> LifeVectorizer:
         """Fit the model."""
         return self

     def sample(
         self,
-        words: Union[List[str], np.array],
+        words: List[str],
         fragment_size: int,
         method: str,
     ) -> List[List[str]]:
@@ -120,7 +120,7 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
         self,
         document: List[str],
         sample_size: int,
-    ) -> np.array:
+    ) -> np.ndarray:
         """Extract features from a document given a sample size."""
         if self.sample_type == "both":
             return np.concatenate(
@@ -137,12 +137,12 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
         document: List[str],
         fragment_size: int,
         method: str,
-    ) -> np.array:
+    ) -> np.ndarray:
         samples = self.sample(document, fragment_size, method)
-        features = []
+        features_as_list = []
         for sample in samples:
-            features.append(get_features_for_sample(sample))
-        features = np.array(features)
+            features_as_list.append(get_features_for_sample(sample))
+        features = np.array(features_as_list)
         means = np.mean(features, axis=0)
         stds = np.std(features, axis=0)
         return np.concatenate(
@@ -150,8 +150,8 @@ class LifeVectorizer(BaseEstimator, TransformerMixin):
         )

     def transform(
-        self, x: List[List[str]], _y: Union[List, np.array] = None
-    ) -> np.array:
+        self, x: List[List[str]], _y: Union[List, np.ndarray] = None
+    ) -> np.ndarray:
         """Calculate samples and extracts features from documents."""
         ret = []
         for document in x:
...
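The features_as_list rename in LifeVectorizer is the other typical mypy pattern in this commit: a variable keeps the type of its first assignment, so rebinding a list to the np.ndarray returned by np.array(...) is reported as an incompatible assignment. A small standalone sketch of the same pattern, using a hypothetical feature function rather than the repository's get_features_for_sample:

# Small sketch (hypothetical helper, not from the repository).
from typing import List

import numpy as np


def summarize(samples: List[List[float]]) -> np.ndarray:
    # Collect per-sample features in a plain list first ...
    features_as_list: List[List[float]] = []
    for sample in samples:
        features_as_list.append([sum(sample), max(sample)])
    # ... then convert once under a new name, so no variable changes type.
    features = np.array(features_as_list)
    return np.concatenate([np.mean(features, axis=0), np.std(features, axis=0)])


print(summarize([[1.0, 2.0], [3.0, 4.0]]))  # means followed by standard deviations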
@@ -4,7 +4,8 @@ from __future__ import annotations
 import logging
 import math
 from collections import defaultdict
-from typing import Any, Dict, Iterable, List
+from functools import partial
+from typing import Any, DefaultDict, Dict, Iterable, List

 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -17,7 +18,9 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
     def __init__(self) -> None:
         """Initialize the model."""
-        self.t: Dict[str, np.array] = defaultdict(np.array)
+        self.t: DefaultDict[str, np.ndarray] = defaultdict(
+            partial(np.ndarray, 0)
+        )  # type:ignore

     def fit(
         self, X: List[str], y: Iterable[str], **_fit_params: Any
@@ -58,9 +61,9 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
         return self

-    def transform(self, X: Iterable[str], _y: Any = None) -> np.array:
+    def transform(self, X: Iterable[str], _y: Any = None) -> np.ndarray:
         """Transform data due to previously learned frequencies."""
-        result = []
+        result: List[int] = []
         for k in X:
             document_sum = 0
             doc_words = k.split()
@@ -68,6 +71,6 @@ class SubFrequencyVectorizer(BaseEstimator, TransformerMixin):
                 if j not in self.t:
                     continue
                 tf = doc_words.count(j)
-                document_sum += self.t[j] * tf / len(doc_words)
+                document_sum += int(self.t[j] * tf / len(doc_words))
             result.append(document_sum)
         return np.array(result)
"""Transformers working on NLTK tree objects.""" """Transformers working on NLTK tree objects."""
from __future__ import annotations from __future__ import annotations
from typing import Any, Iterable, List, Union from typing import Any, List, Union
import nltk import nltk
import numpy as np import numpy as np
...@@ -178,7 +178,7 @@ class TreeChainTransformer(BaseEstimator, TransformerMixin): ...@@ -178,7 +178,7 @@ class TreeChainTransformer(BaseEstimator, TransformerMixin):
return result return result
def _get_average_height(document: List[nltk.Tree]) -> np.array: def _get_average_height(document: List[nltk.Tree]) -> float:
"""Calculate the average height of all trees in a document.""" """Calculate the average height of all trees in a document."""
return np.average([tree.height() for tree in document]) return np.average([tree.height() for tree in document])
...@@ -194,14 +194,14 @@ def _calculate_average_children(tree: nltk.Tree) -> List[int]: ...@@ -194,14 +194,14 @@ def _calculate_average_children(tree: nltk.Tree) -> List[int]:
return result return result
def _get_average_children(document: List[nltk.Tree]) -> np.array: def _get_average_children(document: List[nltk.Tree]) -> float:
result = [] result = []
for tree in document: for tree in document:
result += _calculate_average_children(tree) result += _calculate_average_children(tree)
return np.average(result) return np.average(result)
def _get_average_inner_to_leaf_ratio(document: List[nltk.Tree]) -> np.array: def _get_average_inner_to_leaf_ratio(document: List[nltk.Tree]) -> float:
result = [] result = []
for tree in document: for tree in document:
num_leaves = len(tree.leaves()) num_leaves = len(tree.leaves())
...@@ -218,7 +218,7 @@ def _get_max_tree_width(tree: nltk.Tree) -> int: ...@@ -218,7 +218,7 @@ def _get_max_tree_width(tree: nltk.Tree) -> int:
return maximum return maximum
def _get_max_child_width(document: List[nltk.Tree]) -> np.array: def _get_max_child_width(document: List[nltk.Tree]) -> float:
result = [] result = []
for tree in document: for tree in document:
result.append(_get_max_tree_width(tree)) result.append(_get_max_tree_width(tree))
......
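The return-type changes in the tree helpers reflect that np.average over a flat list collapses to a single scalar, so float is the accurate annotation where np.array never was a valid one. A brief sketch with a hypothetical one-tree document (the helper mirrors, but is not, the one in the diff):

# Brief sketch (hypothetical data; helper mirrors the one in the diff).
from typing import List

import nltk
import numpy as np


def average_height_sketch(document: List[nltk.Tree]) -> float:
    # np.average returns a scalar here, so float() is a precise wrapper.
    return float(np.average([tree.height() for tree in document]))


doc = [nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP it)))")]
print(average_height_sketch(doc))  # 4.0 under NLTK's height convention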