Skip to content
Snippets Groups Projects
Commit e12bd853 authored by Benjamin Murauer's avatar Benjamin Murauer
Browse files

release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output...

release v0.4.11: added tests for StanzaNlpToFieldTransformer, changed output to a nested list instead of strings.
parent 375b3ee9
No related branches found
No related tags found
No related merge requests found
Pipeline #51213 failed
[tool.poetry] [tool.poetry]
name = "tuhlbox" name = "tuhlbox"
version = "0.4.10" version = "0.4.11"
homepage = "https://git.uibk.ac.at/csak8736/tuhlbox" homepage = "https://git.uibk.ac.at/csak8736/tuhlbox"
description = "Personal toolbox of language processing models." description = "Personal toolbox of language processing models."
authors = ["Benjamin Murauer <b.murauer@posteo.de>"] authors = ["Benjamin Murauer <b.murauer@posteo.de>"]
......
from typing import List
from tuhlbox.stanza import StanzaNlpToFieldTransformer, StanzaParserTransformer
sentences: List
def setup_function() -> None:
    """Parse the fixture texts and store the result in the module-level ``sentences``.

    Two raw English documents (the first holding two sentences, the second
    one) are run through an English ``StanzaParserTransformer``; the tests in
    this module read the parsed output from the ``sentences`` global.

    NOTE(review): the name matches pytest's xunit-style per-test hook, so this
    presumably re-runs the parser before every test in the module -- confirm.
    """
    global sentences
    first_text = "I have been trying to reach you. This is another sentence in this document."
    second_text = "This is a second document, independent from the others."
    english_parser = StanzaParserTransformer("en", silent=True)
    sentences = english_parser.transform([first_text, second_text])
def test_stanza_nlp_to_field_transformer_xpos() -> None:
    """Check that extracting ``xpos`` yields documents -> sentences -> tags.

    The expected value is a nested list: one entry per document, each holding
    one list of tag strings per sentence.
    """
    xpos_extractor = StanzaNlpToFieldTransformer("xpos")

    # Document 1 contains two sentences.
    doc1_first = ["PRP", "VBP", "VBN", "VBG", "TO", "VB", "PRP", "."]
    doc1_second = ["DT", "VBZ", "DT", "NN", "IN", "DT", "NN", "."]
    # Document 2 contains a single sentence.
    doc2_only = ["DT", "VBZ", "DT", "JJ", "NN", ",", "JJ", "IN", "DT", "NNS", "."]
    expected = [[doc1_first, doc1_second], [doc2_only]]

    actual = xpos_extractor.transform(sentences)
    assert expected == actual
def test_stanza_nlp_to_field_transformer_upos() -> None:
    """Check that extracting ``upos`` yields documents -> sentences -> tags.

    The expected value is a nested list: one entry per document, each holding
    one list of tag strings per sentence.
    """
    upos_extractor = StanzaNlpToFieldTransformer("upos")

    # Document 1 contains two sentences.
    doc1_first = ["PRON", "AUX", "AUX", "VERB", "PART", "VERB", "PRON", "PUNCT"]
    doc1_second = ["PRON", "AUX", "DET", "NOUN", "ADP", "DET", "NOUN", "PUNCT"]
    # Document 2 contains a single sentence.
    doc2_only = [
        "PRON", "AUX", "DET", "ADJ", "NOUN", "PUNCT",
        "ADJ", "ADP", "DET", "NOUN", "PUNCT",
    ]
    expected = [[doc1_first, doc1_second], [doc2_only]]

    actual = upos_extractor.transform(sentences)
    assert expected == actual
...@@ -106,11 +106,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin): ...@@ -106,11 +106,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
""" """
Flattens a stanford document in the same order as the parsed text. Flattens a stanford document in the same order as the parsed text.
Input (Document | Sentence): Input document:
each document is expected to be a StanfordNLP document or sentence. each document is expected to be a StanfordNLP document.
Output: each document is returned as a single string, where each word is Output document: each document is returned as a list of sentences; and each sentence
represented as the field that is to be extracted, separated by spaces. is a list of tags (=strings)
Sentences are separated by newlines.
""" """
def __init__(self, field: str): def __init__(self, field: str):
...@@ -128,11 +128,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin): ...@@ -128,11 +128,11 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
def transform( def transform(
self, x: Iterable[Union[str, Document, Sentence]], _y: Any = None self, x: Iterable[Union[str, Document, Sentence]], _y: Any = None
) -> List[str]: ) -> List[List[List[str]]]:
"""Transform documents.""" """Transform documents."""
result = [] result: List[List[List[str]]] = []
for document in x: for document in x:
document_result: List[str] = [] document_result: List[List[str]] = []
sentences: List[Sentence] = [] sentences: List[Sentence] = []
if isinstance(document, Sentence): if isinstance(document, Sentence):
sentences = [document] sentences = [document]
...@@ -145,8 +145,8 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin): ...@@ -145,8 +145,8 @@ class StanzaNlpToFieldTransformer(BaseEstimator, TransformerMixin):
if label is None: if label is None:
raise ValueError(f"label is None for word: {word}") raise ValueError(f"label is None for word: {word}")
new_sentence.append(label) new_sentence.append(label)
document_result.append(" ".join(new_sentence)) document_result.append(new_sentence)
result.append("\n".join(document_result)) result.append(document_result)
return result return result
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment