release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output...

release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output to a nested list instead of strings.

release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output...
release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output to a nested list instead of strings.
e12bd853 · Benjamin Murauer · 375b3ee9 · e12bd853 · e12bd853 · e12bd853
Commit e12bd853 authored 3 years ago by Benjamin Murauer
--- a/pyproject.toml
+++ b/pyproject.toml
 [tool.poetry]
 name = "tuhlbox"
-version = "0.4.10"
+version = "0.4.11"
 homepage = "https://git.uibk.ac.at/csak8736/tuhlbox"
 description = "Personal toolbox of language processing models."
 authors = ["Benjamin Murauer <b.murauer@posteo.de>"]

--- a/tests/test_stanza.py
+++ b/tests/test_stanza.py
+from typing import List
+from tuhlbox.stanza import StanzaNlpToFieldTransformer, StanzaParserTransformer
+
+sentences: List
+
+
+def setup_function() -> None:
+    global sentences
+    documents = [
+        "I have been trying to reach you. This is another sentence in this document.",
+        "This is a second document, independent from the others.",
+    ]
+    parser = StanzaParserTransformer("en", silent=True)
+    sentences = parser.transform(documents)
+
+
+def test_stanza_nlp_to_field_transformer_xpos() -> None:
+    transformer = StanzaNlpToFieldTransformer("xpos")
+    expected = [
+        [  # document 1
+            ["PRP", "VBP", "VBN", "VBG", "TO", "VB", "PRP", "."],  # sentence 1
+            ["DT", "VBZ", "DT", "NN", "IN", "DT", "NN", "."],  # sentence 2
+        ],
+        [  # document 2 (one sentence)
+            ["DT", "VBZ", "DT", "JJ", "NN", ",", "JJ", "IN", "DT", "NNS", "."],
+        ],
+    ]
+    actual = transformer.transform(sentences)
+    assert expected == actual
+
+
+def test_stanza_nlp_to_field_transformer_upos() -> None:
+    transformer = StanzaNlpToFieldTransformer("upos")
+    expected = [
+        [  # document 1
+            ["PRON", "AUX", "AUX", "VERB", "PART", "VERB", "PRON", "PUNCT"],
+            ["PRON", "AUX", "DET", "NOUN", "ADP", "DET", "NOUN", "PUNCT"],
+        ],
+        [  # document 2
+            [
+                "PRON",
+                "AUX",
+                "DET",
+                "ADJ",
+                "NOUN",
+                "PUNCT",
+                "ADJ",
+                "ADP",
+                "DET",
+                "NOUN",
+                "PUNCT",
+            ]
+        ],
+    ]
+    actual = transformer.transform(sentences)
+    assert expected == actual
--- a/tuhlbox/stanza.py
+++ b/tuhlbox/stanza.py
@@ -106,11 +106,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
    """
    Extracts one field per word from a stanford document, in the same order as the parsed text.

-    Input (Document | Sentence):
-        each document is expected to be a StanfordNLP document or sentence.
-    Output: each document is returned as a single string, where each word is
-        represented as the field that is to be extracted, separated by spaces.
-        Sentences are separated by newlines.
+    Input document:
+        each document is expected to be a StanfordNLP document or sentence.
+    Output document: each document is returned as a list of sentences, and
+        each sentence is a list of tags (strings), one per word.
+
    """

    def __init__(self, field: str):
@@ -128,11 +128,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):

    def transform(
        self, x: Iterable[Union[str, Document, Sentence]], _y: Any = None
-    ) -> List[str]:
+    ) -> List[List[List[str]]]:
        """Transform documents."""
-        result = []
+        result: List[List[List[str]]] = []
        for document in x:
-            document_result: List[str] = []
+            document_result: List[List[str]] = []
            sentences: List[Sentence] = []
            if isinstance(document, Sentence):
                sentences = [document]
@@ -145,8 +145,8 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
                    if label is None:
                        raise ValueError(f"label is None for word: {word}")
                    new_sentence.append(label)
-                document_result.append(" ".join(new_sentence))
-            result.append("\n".join(document_result))
+                document_result.append(new_sentence)
+            result.append(document_result)
        return result