fix(preprocessor): add multimodal arc ref

kengz · kengz · commit 70ee2601bed9 · 2021-11-18T07:37:03.000-05:00
diff --git a/test/module/perceiver_io/test_preprocessor.py b/test/module/perceiver_io/test_preprocessor.py
@@ -50,7 +50,7 @@ def test_text_preprocessor(batch, in_shape, num_freq_bands):
         'vector': {
             'type': 'FourierPreprocessor',
             'num_freq_bands': 16,
-            'max_reso': [32],
+            'max_reso': [31],
             'cat_pos': True,
         },
     }
diff --git a/torcharc/arc_ref.py b/torcharc/arc_ref.py
@@ -231,6 +231,56 @@
             }
         }
     },
+    'perceiver_multimodal2classifier': {
+        'type': 'Perceiver',
+        'in_shape': {'image': [224, 224, 3], 'vector': [31, 2]},
+        'arc': {
+            'preprocessor': {
+                'type': 'MultimodalPreprocessor',
+                'arc': {
+                    'image': {
+                        'type': 'FourierPreprocessor',
+                        'num_freq_bands': 64,
+                        'max_reso': [224, 224],
+                        'cat_pos': True,
+                    },
+                    'vector': {
+                        'type': 'FourierPreprocessor',
+                        'num_freq_bands': 16,
+                        'max_reso': [31],
+                        'cat_pos': True,
+                    },
+                },
+                'pad_channels': 2,
+            },
+            'encoder': {
+                'type': 'PerceiverEncoder',
+                'latent_shape': [2048, 1024],
+                'head_dim': 1024,  # usually preserves latent_shape[-1]
+                'v_head_dim': None,  # defaults to head_dim
+                'cross_attn_num_heads': 1,
+                'cross_attn_widening_factor': 1,
+                'num_self_attn_blocks': 8,
+                'num_self_attn_per_block': 6,
+                'self_attn_num_heads': 8,
+                'self_attn_widening_factor': 1,
+                'dropout_p': 0.0,
+            },
+            'decoder': {
+                'type': 'PerceiverDecoder',
+                'out_shape': [1, 1024],
+                'head_dim': 1024,  # usually preserves out_shape[-1]
+                'v_head_dim': None,  # defaults to head_dim
+                'cross_attn_num_heads': 1,
+                'cross_attn_widening_factor': 1,
+                'dropout_p': 0.0,
+            },
+            'postprocessor': {
+                'type': 'ClassificationPostprocessor',
+                'out_dim': 10,
+            }
+        }
+    },
     # DAGs
     'forward': {
         'dag_in_shape': [8],
diff --git a/torcharc/module/perceiver_io/preprocessor.py b/torcharc/module/perceiver_io/preprocessor.py
@@ -85,7 +85,7 @@ def build_pos_encoding(self, pos: torch.Tensor, max_reso: list = None) -> torch.
         @return position encodings tensor of shape (x, y,... d*(2*num_freq_bands+1))
         '''
         max_reso = max_reso or pos.shape[:-1]
-        assert len(max_reso) == len(pos.shape[:-1]), f'max_reso len(shape) must match pos len(shape), but got {len(max_reso)} != {len(pos.shape[:-1])}'
+        assert len(max_reso) == len(pos.shape[:-1]), f'max_reso len(shape) must match pos len(shape), but got {len(max_reso)} instead of {len(pos.shape[:-1])}'
         freq_bands = torch.stack([torch.linspace(1.0, max_r / 2.0, steps=self.num_freq_bands) for max_r in max_reso])
         pos_freqs = rearrange(torch.einsum('...d,df->d...f', pos, freq_bands), 'd ... f -> ... (d f)')
 

Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ def test_text_preprocessor(batch, in_shape, num_freq_bands):`
`50`	`50`	`'vector': {`
`51`	`51`	`'type': 'FourierPreprocessor',`
`52`	`52`	`'num_freq_bands': 16,`
`53`		`- 'max_reso': [32],`
	`53`	`+ 'max_reso': [31],`
`54`	`54`	`'cat_pos': True,`
`55`	`55`	`},`
`56`	`56`	`}`