Commit 4fb99a2b authored by Benjamin Murauer

first commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
requirements-dev.txt
.idea
BSD 2-Clause License
Copyright (c) 2019, DBIS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[tool.poetry]
name = "tuhlbox"
version = "0.1.0"
description = ""
authors = ["Benjamin Murauer <b.murauer@gmail.com>"]

[tool.poetry.dependencies]
python = "^3.9"
scikit-learn = "*"
treegrams = "^0.1.0"
gensim = "^3.8.3"
dstoolbox = "^0.10.1"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
from nltk import word_tokenize

from tuhlbox.doc2vec import Doc2VecTransformer


def test_transformation():
    documents = [
        word_tokenize("This is an example sentence."),
        word_tokenize("This is a second piece of text."),
    ]
    transformer = Doc2VecTransformer(vector_size=123)
    actual = transformer.fit_transform(documents)
    assert len(actual) == len(documents)
    assert len(actual[0]) == 123
    assert len(actual[1]) == 123
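
The test above exercises only the public API. As a rough sketch of how such a transformer could be built (an assumption for illustration, not the code committed in this repository), a scikit-learn-compatible wrapper around gensim's Doc2Vec might look like this:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.base import BaseEstimator, TransformerMixin


class SketchDoc2VecTransformer(BaseEstimator, TransformerMixin):
    """Hypothetical wrapper: pre-tokenized documents in, fixed-size vectors out."""

    def __init__(self, vector_size=100, epochs=10):
        self.vector_size = vector_size
        self.epochs = epochs

    def fit(self, X, y=None):
        # tag each training document with its index so gensim can learn a vector
        tagged = [TaggedDocument(words, [i]) for i, words in enumerate(X)]
        self.model_ = Doc2Vec(
            tagged, vector_size=self.vector_size, min_count=1, epochs=self.epochs
        )
        return self

    def transform(self, X):
        # infer a vector of length vector_size for each (possibly unseen) document
        return [self.model_.infer_vector(words) for words in X]

Any implementation along these lines satisfies the test: fit_transform returns one vector per input document, each of length vector_size.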
import itertools
import unittest

import numpy as np

from tuhlbox.life import LifeVectorizer


class TestLife(unittest.TestCase):
    def test_short_fragment(self):
        fragment_sizes = [1000]  # larger than text
        text = [str(x) for x in range(100)]
        transformer = LifeVectorizer(fragment_sizes, 1, "fragment", force=True)
        actual = transformer.transform([text])[0].tolist()
        predicted = [100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        self.assertEqual(actual, predicted)

    def test_short_bow(self):
        fragment_sizes = [1000]  # larger than text
        text = [str(x) for x in range(100)]
        transformer = LifeVectorizer(fragment_sizes, 1, "bow", force=True)
        actual = transformer.transform([text])[0].tolist()
        predicted = [100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        self.assertEqual(actual, predicted)

    def test_short_both(self):
        fragment_sizes = [1000]  # larger than text
        text = [str(x) for x in range(100)]
        transformer = LifeVectorizer(fragment_sizes, 1, "both", force=True)
        actual = transformer.transform([text])[0].tolist()
        predicted = [
            100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        ]
        self.assertEqual(actual, predicted)

    def test_short_document_exceptions(self):
        text = [str(x) for x in range(100)]
        for m in ["fragment", "bow", "both"]:
            transformer = LifeVectorizer([1000], 1, m, force=False)
            self.assertRaises(ValueError, transformer.transform, [text])

    def test_random_shape_sizes(self):
        n_sizes = np.random.randint(1, 10)
        fragment_sizes = np.random.randint(2, 50, size=n_sizes)
        text = [str(x) for x in range(100)]
        vec1 = LifeVectorizer(fragment_sizes, 1, "fragment")
        vec2 = LifeVectorizer(fragment_sizes, 1, "bow")
        vec3 = LifeVectorizer(fragment_sizes, 1, "both")
        self.assertEqual(vec1.transform([text])[0].shape, (n_sizes * 8,))
        self.assertEqual(vec2.transform([text])[0].shape, (n_sizes * 8,))
        self.assertEqual(vec3.transform([text])[0].shape, (n_sizes * 16,))

    def test_life_single_fragment(self):
        transformer = LifeVectorizer([42], 50, "fragment")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        self.assertEqual(actual, predicted)

    def test_life_single_bow(self):
        transformer = LifeVectorizer([42], 100, "bow")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        self.assertEqual(actual, predicted)

    def test_life_single_bfs(self):
        transformer = LifeVectorizer([42], 100, "both")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [
            42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        ]
        self.assertEqual(actual, predicted)

    def test_life_double_fragment(self):
        transformer = LifeVectorizer([42, 41], 50, "fragment")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [
            42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            41.0, 41.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        ]
        self.assertEqual(actual, predicted)

    def test_life_double_bow(self):
        transformer = LifeVectorizer([42, 41], 100, "bow")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [
            42.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            41.0, 41.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        ]
        self.assertEqual(actual, predicted)

    def test_life_double_bfs(self):
        rand_1 = np.random.randint(2, 100)
        rand_2 = np.random.randint(2, 100)
        transformer = LifeVectorizer([rand_1, rand_2], 100, "both")
        text = [str(x) for x in range(100)]
        actual = transformer.transform([text])[0].tolist()
        predicted = [
            float(rand_1), float(rand_1), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            float(rand_1), float(rand_1), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            float(rand_2), float(rand_2), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            float(rand_2), float(rand_2), 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        ]
        self.assertEqual(actual, predicted)

    def test_life_half_fragment(self):
        frag_size = np.random.randint(2, 100)
        transformer = LifeVectorizer([frag_size], 100, "fragment")
        # this text consists of pairs of two identical words
        pairs = [(y, y) for y in range(100)]
        text = list(itertools.chain.from_iterable(pairs))
        actual = transformer.transform([text])[0].tolist()
        vocabulary_size = actual[0]
        freq_1_count = actual[1]
        freq_4_count = actual[2]
        freq_10_count = actual[3]
        self.assertTrue(vocabulary_size <= frag_size / 2 + 1)
        self.assertTrue(vocabulary_size >= frag_size / 2)
        # in the most extreme case, no sample has any hapax legomena
        self.assertTrue(freq_1_count >= 0)
        # in the most extreme case, all samples have 2 hapax legomena
        self.assertTrue(freq_1_count <= 2)
        # in the most extreme case, all samples have 2 hapax legomena and
        # (frag_size - 2) / 2 dis legomena
        self.assertTrue(freq_4_count >= frag_size / 2 - 1)
        # in the most extreme case, no sample has hapax legomena and there
        # are frag_size / 2 dis legomena
        self.assertTrue(freq_4_count <= frag_size / 2)
        # no sample should have a word occurring more often than twice
        self.assertAlmostEqual(freq_10_count, 0.0)
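
The expectations above imply a feature layout of eight statistics per fragment size (vocabulary size first, then counts of words in increasing frequency bands, averaged over the sampled fragments), with mode "both" concatenating the "fragment" and "bow" variants. A minimal sketch of that kind of computation, under this assumed reading (sketch_life_features is hypothetical, not the tuhlbox implementation):

import random
from collections import Counter


def sketch_life_features(tokens, frag_size, n_samples):
    """Hypothetical helper: average vocabulary-richness statistics over
    randomly sampled contiguous fragments (assumes frag_size <= len(tokens))."""
    feats = [0.0] * 8
    for _ in range(n_samples):
        start = random.randrange(len(tokens) - frag_size + 1)
        counts = Counter(tokens[start:start + frag_size])
        feats[0] += len(counts)  # vocabulary size of the fragment
        feats[1] += sum(1 for c in counts.values() if c == 1)       # hapax legomena
        feats[2] += sum(1 for c in counts.values() if 1 < c <= 4)   # low-frequency band
        feats[3] += sum(1 for c in counts.values() if 4 < c <= 10)  # mid-frequency band
        # the remaining four slots would hold further richness statistics
    return [f / n_samples for f in feats]

On the all-distinct token lists used in the tests, every fragment word is a hapax legomenon, which is why the first two expected entries both equal the fragment size and the frequency-band counts are zero.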
import numpy as np
from numpy.testing import assert_array_equal

from tuhlbox.stringkernels import (
    intersection_kernel,
    presence_kernel,
    spectrum_kernel,
)

docs = [
    "I like this old movie. The movie is very nice.",
    "In my opinion the book tells a very nice story. I really like it.",
    "I wonder if you could drink this juice. It tastes so bad. Isn’t it bad?",
    "Your dish is too spicy. You must be a such bad cook. "
    "Don’t worry, I am as bad as you.",
]
ngram_min, ngram_max = 1, 4


def test_intersection_kernel():
    # obtained from: java ComputeStringKernel intersection 1 4 sentences.txt <outfile>  # noqa: E501
    expected = np.array(
        [
            [178, 95, 66, 49],
            [95, 254, 72, 72],
            [66, 72, 278, 112],
            [49, 72, 112, 334],
        ],
        dtype=int,
    )
    assert_array_equal(expected, intersection_kernel(docs, docs, ngram_min, ngram_max))


def test_presence_kernel():
    # obtained from: java ComputeStringKernel presence 1 4 sentences.txt <outfile>  # noqa: E501
    expected = np.array(
        [
            [128, 67, 42, 29],
            [67, 197, 38, 42],
            [42, 38, 209, 64],
            [29, 42, 64, 235],
        ],
        dtype=int,
    )
    assert_array_equal(expected, presence_kernel(docs, docs, ngram_min, ngram_max))


def test_spectrum_kernel():
    # obtained from: java ComputeStringKernel spectrum 1 4 sentences.txt <outfile>  # noqa: E501
    expected = np.array(
        [
            [390, 335, 300, 313],
            [335, 598, 393, 458],
            [300, 393, 680, 585],
            [313, 458, 585, 1006],
        ],
        dtype=int,
    )
    assert_array_equal(expected, spectrum_kernel(docs, docs, ngram_min, ngram_max))
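
All three kernels compare the character n-gram multisets of two strings and differ only in how occurrence counts are combined: presence counts distinct shared n-grams, intersection sums the minimum of the two counts, and spectrum sums their product. A minimal pairwise sketch with collections.Counter (an assumed re-implementation for illustration, not the code in tuhlbox.stringkernels):

from collections import Counter


def char_ngram_counts(s, ngram_min, ngram_max):
    # multiset of all character n-grams with ngram_min <= n <= ngram_max
    return Counter(
        s[i:i + n]
        for n in range(ngram_min, ngram_max + 1)
        for i in range(len(s) - n + 1)
    )


def sketch_kernel_entries(a, b, ngram_min, ngram_max):
    ca = char_ngram_counts(a, ngram_min, ngram_max)
    cb = char_ngram_counts(b, ngram_min, ngram_max)
    shared = ca.keys() & cb.keys()
    presence = len(shared)                                 # distinct shared n-grams
    intersection = sum(min(ca[g], cb[g]) for g in shared)  # min of counts
    spectrum = sum(ca[g] * cb[g] for g in shared)          # product of counts
    return presence, intersection, spectrum

For example, docs[0] has 46 characters and therefore 46 + 45 + 44 + 43 = 178 n-grams for n = 1..4, which is exactly the intersection kernel's diagonal entry asserted above.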
from tuhlbox.subfreq import SubFrequencyVectorizer


def test_subset_vectorizer():
    transformer = SubFrequencyVectorizer()
    texts = [
        "male boy man girly ",
        "female girly woman boy boy woman",
        "super duper super man ",
        "man manly boy man",
    ]
    targets = [0, 1, 2, 0]  # one target per sample, three distinct classes
    actual = transformer.fit_transform(texts, targets)
    assert actual.shape == (4, 3)  # 4 samples, 3 classes
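
The assertion only pins down the output shape: one score per (document, class) pair. The internals of SubFrequencyVectorizer are not shown in this commit; one plausible reading, sketched with hypothetical names, is scoring each text against per-class token-frequency profiles built during fit:

from collections import Counter

import numpy as np


def sketch_class_scores(texts, targets):
    """Hypothetical illustration: one frequency-based score per (text, class)."""
    classes = sorted(set(targets))
    # build a token-frequency profile per class from the training texts
    profiles = {c: Counter() for c in classes}
    for text, target in zip(texts, targets):
        profiles[target].update(text.split())
    rows = []
    for text in texts:
        tokens = text.split()
        rows.append(
            [
                sum(profiles[c][token] for token in tokens) / max(len(tokens), 1)
                for c in classes
            ]
        )
    return np.array(rows)  # shape: (n_samples, n_classes)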
import unittest

import nltk

from tuhlbox.tree import (
    StringToTreeTransformer,
    TreeChainTransformer,
    WordToPosTransformer,
)


class TestTrees(unittest.TestCase):
    def test_tree_parsing(self):
        transformer = StringToTreeTransformer()
        # one document could contain many sentences, and each sentence is a
        # tree.
        documents = [["(S (NP I) (VP (V saw) (NP him)))"]]
        expected = [[nltk.Tree.fromstring(documents[0][0])]]
        actual = transformer.transform(documents)
        self.assertEqual(actual, expected)

    def test_pos_extraction(self):
        transformer = WordToPosTransformer()
        documents = [[nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")]]
        expected = [["NP", "V", "NP"]]
        actual = transformer.transform(documents)
        self.assertEqual(actual, expected)

    def test_pos_short_chains(self):
        transformer = TreeChainTransformer(max_length=2, combine_chain_elements=" ")
        documents = [
            # document 1
            [
                # tree 1
                nltk.Tree.fromstring(
                    "(S (DP (D the) (NP dog)) (VP (V chased) (DP (D the) (NP cat))))"
                )
            ],
            # document 2
            [
                # tree 2
                nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))"),
                # tree 3
                nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))"),
            ],
        ]
        expected = [
            # document 1
            [
                # tree 1
                ["S DP", "DP D", "DP NP", "S VP", "VP V", "VP DP", "DP D", "DP NP"]
            ],
            # document 2
            [
                # tree 2
                ["S NP", "S VP", "VP V", "VP NP"],
                # tree 3
                ["S NP", "S VP", "VP V", "VP NP"],
            ],
        ]
        actual = list(transformer.transform(documents))
        self.assertEqual(actual, expected)

    def test_pos_short_chains_nocombine(self):
        transformer = TreeChainTransformer(max_length=2, combine_chain_elements=None)
        documents = [
            # document 1
            [
                # tree 1
                nltk.Tree.fromstring(
                    "(S (DP (D the) (NP dog)) (VP (V chased) (DP (D the) (NP cat))))"
                )
            ],
            # document 2
            [
                # tree 2
                nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))"),
                # tree 3
                nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))"),
            ],
        ]
        expected = [
            # document 1
            [
                # tree 1
                [
                    ["S", "DP"],
                    ["DP", "D"],
                    ["DP", "NP"],
                    ["S", "VP"],
                    ["VP", "V"],
                    ["VP", "DP"],
                    ["DP", "D"],
                    ["DP", "NP"],
                ]
            ],
            # document 2
            [
                # tree 2
                [["S", "NP"], ["S", "VP"], ["VP", "V"], ["VP", "NP"]],
                # tree 3
                [["S", "NP"], ["S", "VP"], ["VP", "V"], ["VP", "NP"]],
            ],
        ]
        actual = list(transformer.transform(documents))
        self.assertEqual(actual, expected)

    def test_pos_long_chains(self):
        transformer = TreeChainTransformer(max_length=None, combine_chain_elements=" ")
        documents = [
            # document 1
            [
                # tree 1
                nltk.Tree.fromstring(
                    "(S (DP (D the) (NP dog)) (VP (V chased) (DP (D the) (NP cat))))"
                )
            ],
            # document 2
            [
                # tree 2
                nltk.Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
            ],
        ]
        expected = [
            # document 1
            [
                # tree 1
                ["S DP D", "S DP NP", "S VP V", "S VP DP D", "S VP DP NP"]
            ],
            # document 2
            [
                # tree 2
                ["S NP", "S VP V", "S VP NP"]
            ],
        ]
        actual = list(transformer.transform(documents))
        self.assertEqual(actual, expected)
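
The chain expectations above can be reproduced by a depth-first walk that emits label chains along root-to-leaf paths: bounded suffixes at every non-root node when max_length is set, and full root-to-preterminal paths when it is None. A sketch under that assumption (sketch_chains is hypothetical, not the TreeChainTransformer internals):

import nltk


def sketch_chains(tree, max_length=2):
    """Hypothetical re-implementation of the chain extraction tested above."""
    chains = []

    def walk(node, path):
        if not isinstance(node, nltk.Tree):
            return  # leaf token, not part of any label chain
        path = path + [node.label()]
        if max_length is None:
            # emit the full chain only at preterminals (nodes directly above leaves)
            if not any(isinstance(child, nltk.Tree) for child in node):
                chains.append(path)
        elif len(path) >= 2:
            chains.append(path[-max_length:])  # bounded-length suffix of the path
        for child in node:
            walk(child, path)

    walk(tree, [])
    return chains


# Joining each chain with " " mirrors combine_chain_elements=" ": for the
# first test tree this prints
# ["S DP", "DP D", "DP NP", "S VP", "VP V", "VP DP", "DP D", "DP NP"].
tree = nltk.Tree.fromstring(
    "(S (DP (D the) (NP dog)) (VP (V chased) (DP (D the) (NP cat))))"
)
print([" ".join(chain) for chain in sketch_chains(tree, max_length=2)])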