Commit 94d5251a authored by Benjamin Murauer

added calculation of first fc layer size

parent 57c6c538
Pipeline #47951 failed
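
In effect: where the fully connected layers previously had to be configured with a hand-computed input size, e.g.

    # the first value in this fc config is somehow related to the stride of
    # the maxpool, unsure of this.
    module__fc_layer_configurations=[350, 256, 128]

the first input size is now derived from the convolution layer configs, so the tests below can pass FcLayerConfig(None, 256) instead.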
"""Test basic skorch models with CNN network."""
import torch
from dstoolbox.transformers import Padder2d, TextFeaturizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from torch import nn
from tuhlbox.torch_classifier import TorchClassifier
from tuhlbox.torch_cnn import CharCNN, ConvLayerConfig, FcLayerConfig
from tuhlbox.torch_lstm import RNNClassifier
x, y = fetch_20newsgroups(return_X_y=True)
VOCAB_SIZE = 1000
EMB_DIM = 300
MAX_SEQ_LEN = 100
def test_cnn() -> None:
pipe = make_pipeline(
TextFeaturizer(max_features=VOCAB_SIZE),
Padder2d(pad_value=VOCAB_SIZE, max_len=MAX_SEQ_LEN, dtype=int),
TorchClassifier(
module=CharCNN,
max_seq_len=MAX_SEQ_LEN,
device="cpu",
batch_size=54,
max_epochs=5,
learn_rate=0.01,
optimizer=torch.optim.Adam,
model_kwargs=dict(
module__emb_layer=nn.Embedding(VOCAB_SIZE + 1, EMB_DIM),
module__conv_layer_configs=[
ConvLayerConfig(EMB_DIM, 50, 7, 1, 3, 3),
ConvLayerConfig(50, 50, 5, 1, 3, 3),
],
module__fc_layer_configs=[
FcLayerConfig(None, 256), # will be calculated automagically
FcLayerConfig(256, 128),
FcLayerConfig(128, 64),
],
),
),
)
pipe.fit(x, y)
def test_lstm() -> None:
pipe = make_pipeline(
TextFeaturizer(max_features=VOCAB_SIZE),
Padder2d(pad_value=VOCAB_SIZE, max_len=MAX_SEQ_LEN, dtype=int),
TorchClassifier(
module=RNNClassifier,
device="cuda",
batch_size=54,
max_epochs=5,
learn_rate=0.01,
optimizer=torch.optim.Adam,
),
)
pipe.fit(x, y)
"""Test basic skorch models with CNN network."""
import torch
from dstoolbox.transformers import Padder2d, TextFeaturizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from skorch import NeuralNetClassifier
from tuhlbox.torch_classifier import TorchClassifier
from tuhlbox.torch_cnn import CharCNN
x, y = fetch_20newsgroups(return_X_y=True)
VOCAB_SIZE = 1000
EMB_DIM = 300
MAX_SEQ_LEN = 100
pipe = make_pipeline(
TextFeaturizer(max_features=VOCAB_SIZE),
Padder2d(pad_value=VOCAB_SIZE, max_len=MAX_SEQ_LEN, dtype=int),
TorchClassifier(
module=CharCNN,
device="cpu",
batch_size=54,
max_epochs=5,
learn_rate=0.01,
optimizer=torch.optim.Adam,
model_kwargs=dict(
module__embedding_dim=EMB_DIM,
module__vocab_size=VOCAB_SIZE,
module__max_seq_length=MAX_SEQ_LEN,
module__conv_layer_configurations=[
(0, 54, 7, 1, 3, 3),
(54, 50, 5, 1, 10, 1),
],
# the first value in this fc config is somehow related to the stride of
# the maxpool, unsure of this.
module__fc_layer_configurations=[350, 256, 128],
),
),
)
pipe.fit(x, y)
"""Test basic skorch models with CNN network."""
import torch
from dstoolbox.transformers import Padder2d, TextFeaturizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from tuhlbox.torch_classifier import TorchClassifier
from tuhlbox.torch_lstm import RNNClassifier
x, y = fetch_20newsgroups(return_X_y=True)
VOCAB_SIZE = 1000
EMB_DIM = 300
MAX_SEQ_LEN = 100
pipe = make_pipeline(
TextFeaturizer(max_features=VOCAB_SIZE),
Padder2d(pad_value=VOCAB_SIZE, max_len=MAX_SEQ_LEN, dtype=int),
TorchClassifier(
module=RNNClassifier,
device="cuda",
batch_size=54,
max_epochs=5,
learn_rate=0.01,
optimizer=torch.optim.Adam,
),
)
pipe.fit(x, y)

@@ -22,6 +22,7 @@ class TorchClassifier(ClassifierMixin, BaseEstimator):
    def __init__(
        self,
        module: Type[nn.Module],
        max_seq_len: Optional[int] = None,
        batch_size: int = 64,
        max_epochs: int = 5,
        learn_rate: float = 1e-3,
@@ -38,12 +39,15 @@ class TorchClassifier(ClassifierMixin, BaseEstimator):
        self.wrapped_model: Optional[NeuralNetClassifier] = None
        self.optimizer = optimizer
        self.label_encoder: LabelEncoder = LabelEncoder()
        self.max_seq_len = max_seq_len

    def fit(self, x: Any, y: Iterable[Any], **fit_kwargs: Any) -> TorchClassifier:
        if self.wrapped_model is None:
            classes = set(y)
            n_classes = len(classes)
            self.model_kwargs["module__n_classes"] = n_classes
            if self.max_seq_len is not None:
                self.model_kwargs["module__max_seq_len"] = self.max_seq_len
            self.wrapped_model = NeuralNetClassifier(
                module=self.module,
                device=self.device,
......
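
# Note on the module__ prefix (skorch convention): keyword arguments passed to
# NeuralNetClassifier with a module__ prefix are forwarded to the module's
# constructor when the network is initialized on fit(). For example (sketch,
# not part of this diff):
#
#   net = NeuralNetClassifier(module=CharCNN, module__n_classes=20, ...)
#   # on fit(), skorch instantiates CharCNN(n_classes=20, ...)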
"""Basic CNN model."""
import math
from collections import namedtuple
from typing import List, Tuple
import torch
import torch.nn as nn
ConvLayerConfig = namedtuple(
"ConvLayerConfig",
[
"in_channels",
"out_channels",
"conv_kernel_size",
"conv_stride",
"max_kernel_size",
"max_stride",
],
)
def _generate_conv_layers(
embedding_dim: int,
conv_layer_configurations: List[Tuple[int, int, int, int, int, int]],
) -> List[nn.Module]:
result: List[nn.Module] = []
for i, layer in enumerate(conv_layer_configurations):
input_size = layer[0] if i > 0 else embedding_dim
FcLayerConfig = namedtuple("FcLayerConfig", ["in_features", "out_features"])


def create_conv_layers(configs: List[ConvLayerConfig]) -> List[nn.Sequential]:
    result = []
    for config in configs:
        result.append(
            nn.Sequential(
                nn.Conv1d(
                    in_channels=config.in_channels,
                    out_channels=config.out_channels,
                    kernel_size=(config.conv_kernel_size,),
                    stride=(config.conv_stride,),
                ),
                nn.ReLU(),
                nn.MaxPool1d(
                    kernel_size=(config.max_kernel_size,),
                    stride=(config.max_stride,),
                ),
            ),
        )
    return result


def create_fc_layers(
    fc_configs: List[FcLayerConfig], conv_configs: List[ConvLayerConfig], start_n: int
) -> List[nn.Sequential]:
    result = []
    # the input size of the first fc layer depends on the output of the conv
    # stack and is therefore computed rather than taken from the config
    start_n = compute_first_fc_layer_input_size(conv_configs, start_n)
    for i, config in enumerate(fc_configs):
        in_features = start_n if i == 0 else config.in_features
        result.append(
            nn.Sequential(
                nn.Linear(in_features=in_features, out_features=config.out_features)
            )
        )
    return result


def compute_first_fc_layer_input_size(
    conv_configs: List[ConvLayerConfig], n: int
) -> int:
    """
    Calculate the input dimension of the first fully connected layer.

    See https://datascience.stackexchange.com/a/40991/9281

    Args:
        conv_configs: Configurations of the convolution layers
        n: starting value (max sequence length)

    Returns:
        the dimension of the first fc layer
    """

    def get_output_dim(in_size: int, kernel: int, stride: int) -> int:
        return math.floor(((in_size - kernel) / stride) + 1)

    for config in conv_configs:
        n = get_output_dim(n, config.conv_kernel_size, config.conv_stride)
        n = get_output_dim(n, config.max_kernel_size, config.max_stride)
    last_conv_out_channels = conv_configs[-1].out_channels
    return n * last_conv_out_channels
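
# Worked example (using the values from the CNN test above; arithmetic added
# for illustration): with n = MAX_SEQ_LEN = 100 and the configs
# ConvLayerConfig(300, 50, 7, 1, 3, 3) and ConvLayerConfig(50, 50, 5, 1, 3, 3):
#
#   after conv1:    (100 - 7) / 1 + 1 = 94
#   after maxpool1: floor((94 - 3) / 3) + 1 = 31
#   after conv2:    (31 - 5) / 1 + 1 = 27
#   after maxpool2: floor((27 - 3) / 3) + 1 = 9
#
# The last conv layer has 50 output channels, so the first fc layer receives
# 9 * 50 = 450 input features.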


class CharCNN(nn.Module):
    """Basic CNN model that can be built with variable amounts of layers etc."""

    def __init__(
        self,
        n_classes: int,
        max_seq_len: int,
        emb_layer: nn.Embedding,
        conv_layer_configs: List[ConvLayerConfig],
        fc_layer_configs: List[FcLayerConfig],
    ):
        """
        Create a new CNN model.

        Args:
            n_classes: Number of classes used
            max_seq_len: Maximum length of the (padded) input sequences
            emb_layer: Embedding layer mapping token ids to dense vectors
            conv_layer_configs: Configurations of the convolution layers
            fc_layer_configs: Configurations of the fully connected layers
        """
        super().__init__()
        self.emb_layer = emb_layer
        # nn.ModuleList (rather than plain lists) is required so that the
        # layers' parameters are registered with the module and picked up by
        # the optimizer
        self.conv_layers = nn.ModuleList(create_conv_layers(conv_layer_configs))
        self.fc_layers = nn.ModuleList(
            create_fc_layers(fc_layer_configs, conv_layer_configs, max_seq_len)
        )
        self.last_layer = nn.Sequential(
            nn.Linear(fc_layer_configs[-1].out_features, n_classes),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # the embedding layer returns the values in a different order than is
        # required by the convolution layers, so we have to swap them
        x = self.emb_layer(x).permute(0, 2, 1)
        for conv in self.conv_layers:
            x = conv(x)
        # flatten all values
        x = x.view(x.size(0), -1)
        for fc in self.fc_layers:
            x = fc(x)
        return self.last_layer(x)

    @staticmethod
    def conv_layer(
        in_channels: int,
        out_channels: int,
        conv_kernel: int,
        conv_stride: int = 1,
        max_kernel: int = 3,
        max_stride: int = 3,
    ) -> nn.Module:
        return nn.Sequential(
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(conv_kernel,),
                stride=(conv_stride,),
            ),
            nn.ReLU(),
            nn.MaxPool1d(max_kernel, max_stride),
        )

    @staticmethod
    def fc_layer(
        in_size: int,
        out_size: int,
        dropout: float = 0.0,
    ) -> nn.Module:
        return nn.Sequential(
            nn.Linear(in_features=in_size, out_features=out_size),
            nn.ReLU(),
            nn.Dropout(p=dropout),
        )
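
# A minimal usage sketch (illustrative only, not part of the commit; values
# taken from the tests above, with n_classes=20 for the 20newsgroups data):
#
#   model = CharCNN(
#       n_classes=20,
#       max_seq_len=100,
#       emb_layer=nn.Embedding(1001, 300),  # VOCAB_SIZE + 1, EMB_DIM
#       conv_layer_configs=[
#           ConvLayerConfig(300, 50, 7, 1, 3, 3),
#           ConvLayerConfig(50, 50, 5, 1, 3, 3),
#       ],
#       fc_layer_configs=[FcLayerConfig(None, 256), FcLayerConfig(256, 128)],
#   )
#   log_probs = model(torch.randint(0, 1001, (2, 100)))  # shape: (2, 20)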