# ddnet.py

from datetime import datetime as dt
import torch
import torch.nn as nn
import numpy as np
from copy import deepcopy
from functools import cached_property
import warnings
from collections import OrderedDict
import os

import disjoint_domain as dd
import util

# Default constructor options for DisjointDomainNet. Any of them can be
# overridden by keyword argument; see the class docstring for their meaning.
net_defaults = dict(
    # domain / context layout
    n_domains=4,
    ctx_per_domain=4,
    # ground-truth attribute layout
    attrs_per_context=60,
    attrs_set_per_item=25,
    padding_attrs=0,
    # item representation layer
    use_item_repr=True,
    item_repr_units=16,
    merged_repr=False,
    # context inputs and representation layer
    use_ctx=True,
    share_ctx=False,
    use_ctx_repr=True,
    ctx_repr_units=16,
    # hidden layer
    hidden_units=32,
    # attribute sharing
    share_attr_units_in_domain=False,
    repeat_attrs_over_domains=False,
    # item similarity structure
    cluster_info='4-2-2',
    last_domain_cluster_info=None,
    # parameter initialization
    param_init_type='normal',
    param_init_scale=0.01,
    fix_biases=False,
    fixed_bias=-2,
    # nonlinearities and loss
    act_fn=torch.sigmoid,
    output_act_fn=None,
    loss_fn=nn.BCELoss,
    include_cross_domain_loss=True,
    # runtime environment
    rng_seed=None,
    torchfp=None,
    device=None,
    verbose=True,
)

# Default options for DisjointDomainNet.do_training(); any of them can be
# overridden by keyword argument.
train_defaults = dict(
    # optimization
    lr=0.01,
    num_epochs=3000,
    batch_size=16,  # a non-positive value means one batch per epoch
    # reporting and snapshots
    report_freq=50,
    snap_freq=50,
    snap_freq_scale='lin',
    scheduler=None,
    # holdout testing
    holdout_testing='none',
    domains_to_hold_out=0,
    train_held_out_only=False,
    reports_per_test=4,
    test_criterion='weighted_acc_loose',
    test_thresh=0.97,
    test_max_epochs=10000,
    # combo testing
    do_combo_testing=False,
    n_combo_per_domain=1,
    # what to record
    param_snapshots=True,
    include_final_eval=True,
)

# Net params that may be supplied as callables, in which case their value is
# generated by calling them on initialization.
# These should be saved/restored separately so that they do not end up
# different in the restored network.
callable_net_params = [
    'cluster_info',
    'last_domain_cluster_info',
]


class DisjointDomainNet(nn.Module):
    """
    Network for disjoint domain learning as depicted in Figure R4.
    
    Constructor keyword arguments:
    - attrs_set_per_item: How many of the ground-truth output attributes are set for each item/context pair
    - use_item_repr: False to skip item representation layer, pass items directly to hidden layer
    - item_repr_units: Size of item representation layer (unless merged_repr is True or use_item_repr is False)
    - use_ctx_repr: False to skip context representation layer, pass contexts directly to hidden layer
    - ctx_repr_units: Size of context representation layer (unless merged_repr is True or use_ctx_repr is False)
    - merged_repr: Use a single representation layer for items and contexts, of size item_repr_units + ctx_repr_units
    - hidden_units: Size of (final) hidden layer
    - use_ctx: False to not have any context inputs at all (probably best with ctx_per_domain=1)
    - share_ctx: True to use one set of context inputs for all domains instead of separate ones
    - share_attr_units_in_domain: True to use the same attr units for each context within each domain.
    - cluster_info: String or dict specifying item similarity structure, etc. - see dd.make_attr_vecs()
    - last_domain_cluster_info: If not None, possibly different cluster_info for the last domain
      - NEW 5/18: If this is a tuple of length N, replaces the final N domains according to the given cluster info.
    - param_init_type: How to initialize weights and biases - 'default' (PyTorch default), 'normal' or 'uniform'
    - param_init_scale: If param_init_type != 'default', std of normal distribution or 1/2 width of uniform distribution
    - fix_biases: If True, don't use trainable biases
    - fixed_bias: Only if fix_biases is True, use this value as the fixed bias (set to 0 for no biases)
    - repeat_attrs_over_domains: Whether to reuse the exact same ground truth attributes, shifted, for each domain
    - act_fn: Function to use as nonlinearity for all layers
    - output_act_fn: Function to use as nonlinearity for the output layer; if None, use same as act_fn
    - loss_fn: Type of loss function to use (will be initialized with reduction='sum')
    - include_cross_domain_loss: If False, mask outputs when computing loss so that
                                 the gradient does not take into account each item's
                                 outputs onto attributes of other domains.
    - rng_seed: Seed for the PyTorch RNG
    - torchfp: Override floating-point class to use for weights & activations
    - device: Override device (torch.device('cuda') or torch.device('cpu'))
    """

    def gen_training_tensors(self):
        """
        Build the full set of input/target tensors for this network.

        The item, context and attribute matrices come from dd.make_io_mats(),
        called with this network's layout parameters, and are converted to
        tensors of dtype self.torchfp on self.device.

        Returns (x_item, x_context, y, y_domain_mask), where y_domain_mask is a
        block-diagonal matrix of ones made of n_domains identical blocks, each
        with y's shape integer-divided by n_domains.
        """
        io_mats = dd.make_io_mats(
            n_domains=self.n_domains,
            ctx_per_domain=self.ctx_per_domain,
            attrs_per_context=self.attrs_per_context,
            attrs_set_per_item=self.attrs_set_per_item,
            padding_attrs=self.padding_attrs,
            cluster_info=self.cluster_info,
            last_domain_cluster_info=self.last_domain_cluster_info,
            repeat_attrs_over_domains=self.repeat_attrs_over_domains,
            share_ctx=self.share_ctx,
            share_attr_units_in_domain=self.share_attr_units_in_domain,
        )
        item_mat, context_mat, attr_mat = io_mats

        tensor_opts = {'dtype': self.torchfp, 'device': self.device}
        x_item = torch.tensor(item_mat, **tensor_opts)
        x_context = torch.tensor(context_mat, **tensor_opts)
        y = torch.tensor(attr_mat, **tensor_opts)

        # one block of ones per domain, laid out along the diagonal
        block_shape = [dim // self.n_domains for dim in y.shape]
        domain_blocks = [torch.ones(block_shape, **tensor_opts) for _ in range(self.n_domains)]
        y_domain_mask = torch.block_diag(*domain_blocks)

        return x_item, x_context, y, y_domain_mask

    def __init__(self, **net_params):
        """
        Build the network, its layers and its training data.

        Keyword arguments override the entries of net_defaults (see the class
        docstring); every resulting key becomes an attribute of the same name.
        Some attributes are then adjusted to be mutually consistent, e.g.
        ctx_per_domain is forced to 1 when use_ctx is False, and rng_seed is
        replaced by the seed actually used when it was None.

        Raises:
            AssertionError: if merged_repr is set while the item or context
                representation layer is skipped.
            ValueError: if param_init_type is not 'default', 'normal' or 'uniform'.
        """
        super(DisjointDomainNet, self).__init__()

        # Merge default params with overrides and make them properties
        net_params = {**net_defaults, **net_params}
        for key, val in net_params.items():
            setattr(self, key, val)

        # Make sure to do this *before* RNG is possibly used in callable params (e.g. cluster info with permutations)
        # although tbh it would be silly to rely on the seed to reproduce anything long-term
        if self.rng_seed is None:
            self.rng_seed = torch.seed()
        else:
            torch.manual_seed(self.rng_seed)

        # Params given as callables are generators: call each one once, now that
        # the RNG is seeded, and keep the generated value so that every later
        # use (training data, item tensors, saving) sees the same thing.
        for key in callable_net_params:
            val = getattr(self, key)
            if callable(val):
                setattr(self, key, val())

        self.device, self.torchfp, _ = util.init_torch(self.device, self.torchfp)
        if self.verbose:
            if self.device.type == 'cuda':
                print('Using CUDA')
            else:
                print('Using CPU')

        # a context representation layer only makes sense if there are contexts at all
        self.use_ctx_repr = self.use_ctx and self.use_ctx_repr
        if self.merged_repr:
            assert self.use_item_repr and self.use_ctx_repr, "Can't both skip and merge repr layers"

        # make sure we don't unnecessarily repeat inputs if not using contexts
        if not self.use_ctx:
            self.ctx_per_domain = 1

        if self.share_ctx:
            self.n_contexts = self.ctx_per_domain
        else:
            self.n_contexts = self.ctx_per_domain * self.n_domains

        self.n_items = dd.ITEMS_PER_DOMAIN * self.n_domains

        self.n_attributes = self.attrs_per_context * self.n_domains
        if not self.share_attr_units_in_domain:
            self.n_attributes *= self.ctx_per_domain

        if self.output_act_fn is None:
            self.output_act_fn = self.act_fn
        self.criterion = self.loss_fn(reduction='sum')

        # all-zero inputs used by calc_hidden_preact when an item or context is omitted
        self.dummy_item = torch.zeros((1, self.n_items), device=self.device)
        self.dummy_ctx = torch.zeros((1, self.n_contexts), device=self.device)

        # a skipped representation layer passes its input through unchanged,
        # so its "size" is the size of the input
        if not self.use_item_repr:
            self.item_repr_units = self.n_items

        if not self.use_ctx_repr:
            self.ctx_repr_units = self.n_contexts

        self.repr_units = self.item_repr_units + self.ctx_repr_units
        if self.merged_repr:
            # inputs should map to full repr layer
            self.item_repr_units = self.repr_units
            self.ctx_repr_units = self.repr_units

        def make_bias(n_units):
            """Make bias for a layer, either a constant or trainable parameter"""
            if self.fix_biases:
                return torch.full((n_units,), self.fixed_bias, device=self.device)
            else:
                return nn.Parameter(torch.empty((n_units,), device=self.device))

        def make_layer(in_size, out_size):
            """Make the (bias-free) linear map and the separate bias of one layer"""
            weights = nn.Linear(in_size, out_size, bias=False).to(self.device)
            biases = make_bias(out_size)
            if isinstance(biases, nn.Parameter) and self.param_init_type == 'default':
                # torch.empty() leaves the values undefined and the 'default' init
                # type skips the re-initialization below, so give trainable biases
                # the same U(-1/sqrt(fan_in), 1/sqrt(fan_in)) init nn.Linear uses.
                bound = in_size ** -0.5
                nn.init.uniform_(biases, -bound, bound)
            return weights, biases

        # define layers
        if self.use_item_repr:
            self.item_to_rep, self.item_rep_bias = make_layer(self.n_items, self.item_repr_units)
        else:
            self.item_to_rep = nn.Identity()
            self.item_rep_bias = torch.zeros((self.n_items,), device=self.device)

        if self.use_ctx:
            if self.use_ctx_repr:
                self.ctx_to_rep, self.ctx_rep_bias = make_layer(self.n_contexts, self.ctx_repr_units)
            else:
                self.ctx_to_rep = nn.Identity()
                self.ctx_rep_bias = torch.zeros((self.n_contexts,), device=self.device)
        else:
            # replace with dummies
            def make_dummy_ctx_rep(context):
                return torch.zeros((context.shape[0], self.ctx_repr_units), device=self.device)
            self.ctx_to_rep = make_dummy_ctx_rep
            self.ctx_rep_bias = torch.zeros((self.ctx_repr_units,), device=self.device)

        self.rep_to_hidden, self.hidden_bias = make_layer(self.repr_units, self.hidden_units)
        self.hidden_to_attr, self.attr_bias = make_layer(self.hidden_units, self.n_attributes)

        # make weights start small
        if self.param_init_type != 'default':
            with torch.no_grad():
                for p in self.parameters():
                    if self.param_init_type == 'normal':
                        nn.init.normal_(p.data, std=self.param_init_scale)
                    elif self.param_init_type == 'uniform':
                        nn.init.uniform_(p.data, a=-self.param_init_scale, b=self.param_init_scale)
                    else:
                        raise ValueError('Unrecognized param init type')

        # make some data
        self.x_item, self.x_context, self.y, self.y_domain_mask = self.gen_training_tensors()
        self.n_inputs = len(self.y)

        # individual item/context tensors for evaluating the network
        self.items, self.item_names = dd.get_items(
            n_domains=self.n_domains, cluster_info=self.cluster_info,
            last_domain_cluster_info=self.last_domain_cluster_info, device=self.device)
        self.contexts, self.context_names = dd.get_contexts(
            n_domains=self.n_domains, ctx_per_domain=self.ctx_per_domain,
            share_ctx=self.share_ctx, device=self.device)

        self.train_x_inds = None  # to be set when training occurs
        
    # Don't want to use __eq__ and break hashability. This is a "loose" equality - does not imply identical hashes.
    def equals(self, other):
        if not isinstance(other, DisjointDomainNet):
            return False
        
        # all weights and biases must be the same
        for pname, pval in self.named_parameters():
            try:
                if not pval.equal(other.get_parameter(pname)):
                    return False
            except AttributeError:
                return False
            
        for pname, _ in other.named_parameters():
            try:
                self.get_parameter(pname)
            except AttributeError:
                return False
        
        # all net params must be the same
        for key in net_defaults:
            comparable_val = util.convert_to_sane_eq_type(getattr(self, key))
            if comparable_val != getattr(other, key):
                return False
        
        if not all([
            self.repr_units == other.repr_units,
            self.x_item.equal(other.x_item),
            self.x_context.equal(other.x_context),
            self.y.equal(other.y),
            self.items.equal(other.items),
            self.item_names == other.item_names,
            self.contexts.equal(other.contexts),
            self.context_names == other.context_names,
            np.array_equal(self.train_x_inds, other.train_x_inds)
        ]):
            return False
        
        return True
    
    @cached_property
    def train_item_inds(self):
        if self.train_x_inds is None:
            return None
        
        with torch.no_grad():
            return np.flatnonzero(self.x_item[self.train_x_inds].any(dim=0).cpu().numpy())
            
    @cached_property
    def train_ctx_inds(self):
        if self.train_x_inds is None:
            return None
        
        with torch.no_grad():
            return np.flatnonzero(self.x_context[self.train_x_inds].any(dim=0).cpu().numpy())
        
    #--- Feedforward computation methods ---#

    def calc_item_repr_preact(self, item):
        assert self.use_item_repr, 'No item representation to calculate'
        return self.item_to_rep(item) + self.item_rep_bias
    
    def calc_context_repr_preact(self, context):
        assert self.use_ctx_repr, 'No context representation to calculate'
        return self.ctx_to_rep(context) + self.ctx_rep_bias

    def calc_hidden_preact(self, item=None, context=None):
        if item is None:
            item = self.dummy_item.expand((context.shape[0] if context is not None else 1), -1)
        if context is None:
            context = self.dummy_ctx.expand(item.shape[0], -1)
            
        irep = self.item_to_rep(item) + self.item_rep_bias
        crep = self.ctx_to_rep(context) + self.ctx_rep_bias

        if self.merged_repr:
            rep = irep + crep
        else:
            rep = torch.cat((irep, crep), dim=1)
        rep = self.act_fn(rep)
        return self.rep_to_hidden(rep) + self.hidden_bias

    def calc_attr_preact(self, item, context):
        hidden = self.act_fn(self.calc_hidden_preact(item, context))
        return self.hidden_to_attr(hidden) + self.attr_bias
    
    def forward(self, item, context):
        return self.output_act_fn(self.calc_attr_preact(item, context))
    
    #--- Training and evaluation methods ---#

    def b_outputs_correct(self, outputs, batch_inds, domain_mask=False):
        """Element-wise function to find which outputs are correct for a batch"""
        if domain_mask:
            outputs = outputs * self.y_domain_mask[batch_inds]
            targets = self.y[batch_inds] * self.y_domain_mask[batch_inds]
        else:
            targets = self.y[batch_inds]
        
        return torch.lt(torch.abs(outputs - targets), 0.1).to(self.torchfp)
    
    def weighted_acc(self, outputs, batch_inds, domain_mask=False):
        """
        For each item in the batch, find the average of accuracy for 0s and accuracy for 1s
        (i.e. correct for unbalanced ground truth output)
        """        
        set_attrs_per_item = torch.sum(self.y[batch_inds] > 0, dim=1, keepdim=True)
        set_weight = 0.5 / set_attrs_per_item
        unset_weight = 0.5 / (self.n_attributes - set_attrs_per_item)
        
        weights = torch.where(self.y[batch_inds].to(bool), set_weight, unset_weight)
        b_correct = self.b_outputs_correct(outputs, batch_inds, domain_mask=domain_mask)
        return torch.sum(weights * b_correct, dim=1)
    
    def weighted_acc_loose(self, outputs, batch_inds, domain_mask=False):
        outputs_binary = (outputs > 0.5).to(self.torchfp)
        return self.weighted_acc(outputs_binary, batch_inds, domain_mask=domain_mask)

    def evaluate_input_set(self, input_inds, all_masked=False):
        """Get the loss, accuracy, weighted accuracy, etc. on a set of inputs (e.g. train or test)"""
        self.eval()
        results = {}
        
        with torch.no_grad():
            outputs = self(self.x_item[input_inds], self.x_context[input_inds])
            if self.include_cross_domain_loss:
                results['loss'] = self.criterion(outputs, self.y[input_inds]) / len(input_inds)
            else:
                masked_outputs = outputs * self.y_domain_mask[input_inds]
                masked_targets = self.y[input_inds] * self.y_domain_mask[input_inds]
                results['loss'] = self.criterion(masked_outputs, masked_targets) / len(input_inds)
                
            results['accuracy'] = torch.mean(self.b_outputs_correct(outputs, input_inds)).item()
            results['weighted_acc'] = torch.mean(self.weighted_acc(outputs, input_inds, all_masked)).item()
            results['weighted_acc_loose'] = torch.mean(self.weighted_acc_loose(outputs, input_inds, all_masked)).item()
            
            if not all_masked:
                results['weighted_acc_loose_indomain'] = torch.mean(self.weighted_acc_loose(outputs, input_inds, True)).item()
        return results

    
    def train_epoch(self, order, batch_size, optimizer):
        """Do training on batches of given size of the examples indexed by order."""
        if type(order) != torch.Tensor:
            order = torch.tensor(order, device='cpu', dtype=torch.long)
        
        self.train()
        for batch_inds in torch.split(order, batch_size) if batch_size > 0 else [order]:
            optimizer.zero_grad()
            outputs = self(self.x_item[batch_inds], self.x_context[batch_inds])
            
            if not self.include_cross_domain_loss:
                outputs = outputs * self.y_domain_mask[batch_inds]
            
            loss = self.criterion(outputs, self.y[batch_inds])
            loss.backward()
            optimizer.step()
            
    def generalize_test(self, batch_size, optimizer, included_inds, targets, test_crit, max_epochs=2000, thresh=0.99):
        """
        See how long it takes the network to reach accuracy threshold on target inputs,
        when training on items specified by included_inds. Then restore the parameters.
        
        'targets' can be an array of indices or a logical mask into the full set of inputs.
        """
        self.train()

        # Save original state of network to restore later
        net_state_dict = deepcopy(self.state_dict())
        optim_state_dict = deepcopy(optimizer.state_dict())

        epochs = 0
        while epochs < max_epochs:
            order = util.permute(included_inds)
            self.train_epoch(order, batch_size, optimizer)
            target_stats = self.evaluate_input_set(targets, all_masked=not self.include_cross_domain_loss)
            target_crit = target_stats[test_crit]

            if target_crit >= thresh:
                break

            epochs += 1

        # Restore old state of network
        self.load_state_dict(net_state_dict)
        optimizer.load_state_dict(optim_state_dict)

        etg_string = '= ' + str(epochs + 1) if epochs < max_epochs else '> ' + str(max_epochs)
        return epochs, etg_string
    
    def take_snapshots(self):
        """
        Return a dict of the activations at each layer, before and after nonlinearity, both unreduced over all inputs
        and averaged over all inputs with each item and context present.
        """
        def get_snap_means(unreduced_snapshot):
            item_mean = torch.full((self.n_items, unreduced_snapshot.shape[1]), np.nan)
            ctx_mean = torch.full((self.n_contexts, unreduced_snapshot.shape[1]), np.nan)
            
            for k_item in self.train_item_inds:
                item_mean[k_item] = torch.mean(unreduced_snapshot[self.x_item[:, k_item] > 0], dim=0)
            
            for k_ctx in self.train_ctx_inds:
                ctx_mean[k_ctx] = torch.mean(unreduced_snapshot[self.x_context[:, k_ctx] > 0], dim=0)
            
            return item_mean, ctx_mean
        
        repr_snaps = {}
        snaps = {}
        self.eval()
        with torch.no_grad():
            if self.use_item_repr:
                repr_snaps['item_preact'] = torch.full((self.n_items, self.item_repr_units), np.nan)
                repr_snaps['item'] = repr_snaps['item_preact'].clone()
                item_preact = self.calc_item_repr_preact(self.items[self.train_item_inds])
                repr_snaps['item_preact'][self.train_item_inds] = item_preact
                repr_snaps['item'][self.train_item_inds] = self.act_fn(item_preact)
                
            if self.use_ctx_repr:
                repr_snaps['context_preact'] = torch.full((self.n_contexts, self.ctx_repr_units), np.nan)
                repr_snaps['context'] = repr_snaps['context_preact'].clone()
                ctx_preact = self.calc_context_repr_preact(self.contexts[self.train_ctx_inds])
                repr_snaps['context_preact'][self.train_ctx_inds] = ctx_preact
                repr_snaps['context'][self.train_ctx_inds] = self.act_fn(ctx_preact)
            
            # get the rest of the layers for all inputs
            snaps['hidden_preact'] = torch.full((self.n_inputs, self.hidden_units), np.nan)
            snaps['hidden'] = snaps['hidden_preact'].clone()
            hidden_preact = self.calc_hidden_preact(self.x_item[self.train_x_inds], self.x_context[self.train_x_inds])
            snaps['hidden_preact'][self.train_x_inds] = hidden_preact
            snaps['hidden'][self.train_x_inds] = self.act_fn(hidden_preact)
            
            snaps['attr_preact'] = torch.full((self.n_inputs, self.n_attributes), np.nan)
            snaps['attr'] = snaps['attr_preact'].clone()
            attr_preact = self.calc_attr_preact(self.x_item[self.train_x_inds], self.x_context[self.train_x_inds])
            snaps['attr_preact'][self.train_x_inds] = attr_preact
            snaps['attr'][self.train_x_inds] = self.output_act_fn(attr_preact)
            
            # if there are both items and contexts, get the versions that are averaged over those
            if self.use_ctx:
                mean_snaps = {}
                for key, snap in snaps.items():
                    mean_snaps['item_' + key], mean_snaps['context_' + key] = get_snap_means(snap)
                snaps.update(mean_snaps)
            
        snaps.update(repr_snaps)
        return snaps           
    
    #--- Subroutines for specific training modes ---#

    def prepare_holdout(self, holdout_item=True, holdout_context=True):
        """
        Choose an item and/or a context to exclude from regular training, so
        that the number of additional epochs needed to reach an accuracy
        threshold on the held-out inputs can be recorded during training.

        Two distinct domain indices are drawn first: the held-out item comes
        from one, and (unless contexts are shared across domains) the held-out
        context from the other.

        Returns (train_x_inds, test_x_item_inds, test_x_ctx_inds): the indices
        into x/y that remain available for training, and the indices of the
        inputs containing the held-out item and the held-out context (each an
        empty list when the corresponding holdout is disabled).
        """
        ho_item_domain, ho_ctx_domain = dd.choose_k_inds(self.n_domains, 2)

        test_x_item_inds = []
        if holdout_item:
            ho_item_ind = ho_item_domain * dd.ITEMS_PER_DOMAIN + dd.choose_k_inds(dd.ITEMS_PER_DOMAIN, 1)
            # rows of the full input set whose item input is the held-out item
            is_ho_item = self.x_item.eq(self.items[ho_item_ind]).all(axis=1).cpu()
            test_x_item_inds = torch.flatten(torch.nonzero(is_ho_item))
            print(f'Holding out item: {self.item_names[ho_item_ind]}')

        test_x_ctx_inds = []
        if holdout_context:
            ho_ctx_ind = dd.choose_k_inds(self.ctx_per_domain, 1)
            if not self.share_ctx:
                # shift from an index within the domain to an index over all contexts
                ho_ctx_ind += ho_ctx_domain * self.ctx_per_domain
            # rows of the full input set whose context input is the held-out context
            is_ho_ctx = self.x_context.eq(self.contexts[ho_ctx_ind]).all(axis=1).cpu()
            test_x_ctx_inds = torch.flatten(torch.nonzero(is_ho_ctx))
            print(f'Holding out context: {self.context_names[ho_ctx_ind]}')

        # everything that is not held out is used during training
        held_out_x_inds = np.concatenate([test_x_item_inds, test_x_ctx_inds])
        train_x_inds = np.setdiff1d(range(self.n_inputs), held_out_x_inds)

        return train_x_inds, test_x_item_inds, test_x_ctx_inds
    
    def prepare_domain_holdout(self, n=1):
        """
        Similar to prepare_holdout, but just hold out the last n domains.

        Returns (train_x_inds, test_x_inds): train_x_inds covers the leading
        inputs up to the first held-out domain, and test_x_inds maps the name
        of each held-out domain to the array of its input indices. Inputs are
        taken to be laid out domain by domain, with
        dd.ITEMS_PER_DOMAIN * ctx_per_domain inputs per domain.
        """
        held_out_domain_inds = range(self.n_domains - n, self.n_domains)
        held_out_domains = list(map(dd.domain_name, held_out_domain_inds))
        print(f'Holding out domain(s) {", ".join(held_out_domains)}')

        x_per_domain = dd.ITEMS_PER_DOMAIN * self.ctx_per_domain
        train_x_inds = np.arange(self.n_inputs - x_per_domain * n)

        test_x_inds = {}
        for kd, dname in zip(held_out_domain_inds, held_out_domains):
            test_x_inds[dname] = x_per_domain * kd + np.arange(x_per_domain, dtype=int)

        return train_x_inds, test_x_inds
    
    def prepare_combo_testing(self, n_per_domain=1):
        """
        For each domain, pick n_per_domain item/context pairs to hold out.
        If possible, hold out a different item and context (by position within
        the domain) for every held-out pair across all domains.
        Otherwise, at least try to ensure that each item and context within each domain is unique.

        Returns (train_x_inds, test_x_inds): integer arrays of indices into the
        full input set for the inputs to train on and the held-out pairs.

        Raises:
            ValueError: if n_per_domain exceeds the number of items or of
                contexts per domain.
        """
        n_holdout_total = n_per_domain * self.n_domains

        if n_holdout_total <= dd.ITEMS_PER_DOMAIN:
            # hold out different item for each domain
            item_mods = dd.choose_k_inds(dd.ITEMS_PER_DOMAIN, n_holdout_total)
        elif n_per_domain <= dd.ITEMS_PER_DOMAIN:
            item_mods = torch.cat([dd.choose_k_inds(dd.ITEMS_PER_DOMAIN, n_per_domain) for _ in range(self.n_domains)])
        else:
            raise ValueError(f'Cannot hold out {n_per_domain} combinations per domain - only {dd.ITEMS_PER_DOMAIN} items')

        # offset to match hold-out items with domains
        item_domain_offsets = torch.repeat_interleave(torch.arange(0, self.n_items, dd.ITEMS_PER_DOMAIN, device='cpu'), n_per_domain)
        ho_items = item_domain_offsets + item_mods

        if n_holdout_total <= self.ctx_per_domain:
            ho_contexts = dd.choose_k_inds(self.ctx_per_domain, n_holdout_total)
        elif n_per_domain <= self.ctx_per_domain:
            ho_contexts = torch.cat([dd.choose_k_inds(self.ctx_per_domain, n_per_domain) for _ in range(self.n_domains)])
        else:
            raise ValueError(f'Cannot hold out {n_per_domain} combinations per domain - only {self.ctx_per_domain} contexts')

        if not self.share_ctx:
            # offset to match hold-out contexts with domains
            ho_contexts += torch.repeat_interleave(torch.arange(0, self.n_contexts, self.ctx_per_domain, device='cpu'), n_per_domain)

        print('Holding out: ' + ', '.join(
            [f'{self.item_names[ii]}/{self.context_names[ci]}' for ii, ci in zip(ho_items, ho_contexts)]
        ))

        # These are indices, so they must be stored as integers: a
        # floating-point array cannot be used to index the input tensors.
        test_x_inds = np.zeros(n_holdout_total, dtype=int)

        # Find indices of held out combos in full input arrays
        for k in range(n_holdout_total):
            b_x_item_ho = self.x_item.eq(self.items[ho_items[k]]).all(axis=1).cpu()
            b_x_ctx_ho = self.x_context.eq(self.contexts[ho_contexts[k]]).all(axis=1).cpu()
            b_x_ho = b_x_item_ho & b_x_ctx_ho

            # each item/context pair should occur exactly once in the inputs
            assert torch.sum(b_x_ho) == 1, 'Uh-oh'
            ind = torch.flatten(torch.nonzero(b_x_ho))[0]
            test_x_inds[k] = ind

        train_x_inds = np.setdiff1d(np.arange(self.n_inputs), test_x_inds)

        return train_x_inds, test_x_inds
    
    #--- Main training entry point ---#

    def do_training(self, **train_params):
        """
        Train the network for the specified number of epochs, etc.
        Return representation snapshots, training reports, and snapshot/report epochs.
        
        If batch_size is negative, use one batch per epoch.
        
        Holdout testing: train with one entire item, context, or both excluded, then
        periodically (every `reports_per_test` reports) test how many epochs are needed
        to train network up to obtaining test_thresh accuracy on the held out inputs.
        If holdout_testing is 'domain', hold out and test on the last domain.
        If 'train_held_out_only' is True, only includes held-out domains for training at test time.
        
        Combo testing: For each domain, hold out one item/context pair. At each report time,
        test the accuracy of the network on the held-out items and contexts.
        
        If param snapshots is true, also returns all weights and biases of the network at
        each snapshot epoch.
        """

        # Merge default params with overrides
        p = {**train_defaults, **train_params}
        
        holdout_testing = p['holdout_testing'].lower() if p['holdout_testing'] is not None else None

        # Deal with old domain holdout syntax
        if p['domains_to_hold_out'] > 0:
            if holdout_testing not in ['none', 'domain']:
                raise ValueError("Can't do both domain and non-domain hold out")
            holdout_testing = 'domain'
        elif holdout_testing == 'domain':
            if 'domains_to_hold_out' in train_params:  # case where holding out 0 domains was explicitly specified
                raise ValueError('Must hold out > 0 domains if doing domain holdout')
            p['domains_to_hold_out'] = 1
        
        optimizer = torch.optim.SGD(self.parameters(), lr=p['lr'])
        
        do_holdout_testing = holdout_testing is not None and holdout_testing != 'none'
        holdout_item = holdout_testing in ['full', 'item']
        holdout_ctx = holdout_testing in ['full', 'context', 'ctx']
        
        if do_holdout_testing and p['do_combo_testing']:
            raise NotImplementedError("That's too much, man - I'm not doing both holdout and combo testing!")
        
        self.train_x_inds = np.arange(self.n_inputs)
        test_x_item_inds = test_x_ctx_inds = None  # for item/context holdout testing
        included_inds_item = included_inds_ctx = None  # for item/context holdout testing
        test_x_inds = None  # for combo or domain holdout testing
        
        if do_holdout_testing:
            if holdout_testing == 'domain':
                self.train_x_inds, test_x_inds = self.prepare_domain_holdout(n=p['domains_to_hold_out'])
            else:
                self.train_x_inds, test_x_item_inds, test_x_ctx_inds = self.prepare_holdout(holdout_item, holdout_ctx)
                
                # which indices to use during testing
                included_inds_item = np.concatenate([self.train_x_inds, test_x_item_inds])
                included_inds_ctx = np.concatenate([self.train_x_inds, test_x_ctx_inds])
                
        elif p['do_combo_testing']:
            self.train_x_inds, test_x_inds = self.prepare_combo_testing(n_per_domain=p['n_combo_per_domain'])
            
        # reset cached properties, if necessary
        for attr_name in ['train_item_inds', 'train_ctx_inds']:
            if hasattr(self, attr_name):
                delattr(self, attr_name)

        etg_digits = len(str(p['test_max_epochs'])) + 2

        snap_epochs = util.calc_snap_epochs(**p)
        epoch_digits = len(str(snap_epochs[-1]))
        n_snaps = len(snap_epochs)
        snaps = []

        params = {}
        if p['param_snapshots']:
            params = {pname: torch.empty((n_snaps, *pval.shape)) for pname, pval in self.named_parameters()}

        n_report = (p['num_epochs']) // p['report_freq'] + 1
        n_etg = int((n_report-1) // p['reports_per_test'] + 1)
        train_reports = ['loss', 'accuracy', 'weighted_acc',
                         'weighted_acc_loose', 'weighted_acc_loose_indomain']
        test_reports = ['test_accuracy', 'test_weighted_acc',
                        'test_weighted_acc_loose', 'test_weighted_acc_loose_indomain']
        
        reports = {rname: np.zeros(n_report) for rname in train_reports}
        
        if holdout_item:
            reports['etg_item'] = np.zeros(n_etg, dtype=int)  # "epochs to generalize"
            
        if holdout_ctx:
            reports['etg_context'] = np.zeros(n_etg, dtype=int)
            
        if holdout_testing == 'domain':
            reports['etg_domain'] = np.zeros(n_etg, dtype=int)
            for kd in range(1, p['domains_to_hold_out']):
                reports[f'etg_domain{kd+1}'] = np.zeros(n_etg, dtype=int)
        
        if p['do_combo_testing']:
            for test_rname in test_reports:
                reports[test_rname] = np.zeros(n_report)

        for epoch in range(p['num_epochs'] + (1 if p['include_final_eval'] else 0)):

            # collect snapshot
            if epoch in snap_epochs:
                k_snap = snap_epochs.index(epoch)
                
                snaps.append(self.take_snapshots())
                
                with torch.no_grad():
                    if p['param_snapshots']:
                        for pname, pval in self.named_parameters():
                            params[pname][k_snap] = pval

            # report progress
            if epoch % p['report_freq'] == 0:
                k_report = epoch // p['report_freq']

                # get current performance
                perf_stats = self.evaluate_input_set(self.train_x_inds)

                report_str = (f'Epoch {epoch:{epoch_digits}d}: ' +
                              f'loss = {perf_stats["loss"]:7.3f}, ' +
                              f'weighted acc (binary) = {perf_stats["weighted_acc_loose"]:.3f}')

                for stat_type, stat in perf_stats.items():
                    reports[stat_type][k_report] = stat

                if do_holdout_testing and k_report % p['reports_per_test'] == 0:
                    k_test = int(k_report // p['reports_per_test'])
                    
                    # Do item and context generalize tests separately
                    if holdout_item:
                        item_etg, item_etg_string = self.generalize_test(
                            p['batch_size'], optimizer, included_inds_item, test_x_item_inds, test_crit=p['test_criterion'],
                            thresh=p['test_thresh'], max_epochs=p['test_max_epochs']
                        )
                        report_str += f', epochs for new item = {item_etg_string:>{etg_digits}}'
                        reports['etg_item'][k_test] = item_etg
                    
                    if holdout_ctx:
                        ctx_etg, ctx_etg_string = self.generalize_test(
                            p['batch_size'], optimizer, included_inds_ctx, test_x_ctx_inds, test_crit=p['test_criterion'],
                            thresh=p['test_thresh'], max_epochs=p['test_max_epochs']
                        )
                        report_str += f', epochs for new context = {ctx_etg_string:>{etg_digits}}'
                        reports['etg_context'][k_test] = ctx_etg
                        
                    if holdout_testing == 'domain':
                        for kd, (dname, this_test_inds) in enumerate(test_x_inds.items()):
                            if p['train_held_out_only']:
                                included_inds = this_test_inds
                            else:
                                included_inds = np.concatenate((self.train_x_inds, this_test_inds))
                            
                            domain_etg, domain_etg_string = self.generalize_test(
                                p['batch_size'], optimizer, included_inds, this_test_inds, test_crit=p['test_criterion'],
                                thresh=p['test_thresh'], max_epochs=p['test_max_epochs']
                            )
                            report_str += f'\n\tEpochs to learn domain {dname}: {domain_etg_string:>{etg_digits}}'
                            report_type = 'etg_domain' + (str(kd+1) if kd > 0 else '')
                            reports[report_type][k_test] = domain_etg
                        
                if p['do_combo_testing']:
                    test_perf_stats = self.evaluate_input_set(test_x_inds)
                    
                    report_str += f', test weighted acc (binary) = {test_perf_stats["weighted_acc_loose"]:.3f}'
                    
                    for stat_type, stat in test_perf_stats.items():
                        if stat_type != 'loss':
                            reports['test_' + stat_type][k_report] = stat
                                        
                print(report_str)

            # do training
            if epoch < p['num_epochs']:
                order = util.permute(self.train_x_inds)
                self.train_epoch(order, p['batch_size'], optimizer)
                if p['scheduler'] is not None:
                    p['scheduler'].step()
                    
        # concatenate snapshots and move to cpu
        if len(snaps) > 0:
            snaps_cpu = {stype: np.stack([s[stype].cpu().numpy() for s in snaps])
                         for stype in snaps[0]}
        else:
            snaps_cpu = {}

        ret_dict = {'snaps': snaps_cpu, 'reports': reports}
        
        if p['param_snapshots']:
            ret_dict['params'] = {pname: pval.cpu().numpy() for pname, pval in params.items()}
        
        return ret_dict


def _merge_with_defaults(defaults, overrides, kind):
    """Return a copy of `defaults` updated with `overrides`; raise KeyError for keys not in `defaults`."""
    merged = defaults.copy()
    if overrides is not None:
        for key, val in overrides.items():
            if key not in merged:
                raise KeyError(f'Unrecognized {kind} param {key}')
            merged[key] = val
    return merged


def _stack_per_net(dicts_all):
    """Stack the arrays stored under each key of the per-network dicts along a new leading axis."""
    return {key: np.stack([one[key] for one in dicts_all]) for key in dicts_all[0]}


def train_n_nets(n=36, run_type='', net_params=None, train_params=None):
    """
    Do a series of runs and save results.
    This is the high-level training entry point that should almost always be used in practice.

    Arguments:
    - n: Number of networks to construct and train (must be at least 1).
    - run_type: Optional prefix for the result file name; when non-empty it is followed by '_'.
    - net_params: Optional dict of overrides for net_defaults.
    - train_params: Optional dict of overrides for train_defaults.

    Returns (save_name, net), where save_name is the path of the written .npz file and
    net is the last network that was trained.

    Raises:
    - ValueError if n < 1.
    - KeyError if net_params or train_params contains a key missing from the corresponding defaults.
    """
    if n < 1:
        # Otherwise the failure only surfaces later as an IndexError on the empty results lists.
        raise ValueError(f'n must be at least 1, got {n}')

    combined_net_params = _merge_with_defaults(net_defaults, net_params, 'net')
    combined_train_params = _merge_with_defaults(train_defaults, train_params, 'train')

    snaps_all = []
    reports_all = []
    parameters_all = []
    ys = []
    train_x_inds = []

    # Values read back from each constructed net, stored per network alongside the results
    per_net_params = {key: [] for key in callable_net_params + ['rng_seed']}

    net = None
    for i in range(n):
        print(f'Training Iteration {i + 1}')
        print('---------------------')

        net = DisjointDomainNet(**combined_net_params)

        for key, val in per_net_params.items():
            val.append(getattr(net, key))

        res = net.do_training(**combined_train_params)

        snaps_all.append(res['snaps'])
        reports_all.append(res['reports'])
        if 'params' in res:
            parameters_all.append(res['params'])

        ys.append(net.y.cpu().numpy())
        train_x_inds.append(net.train_x_inds)

        print('')

    # Combine the per-network results; each array gains a leading "network" axis.
    snaps = _stack_per_net(snaps_all)
    reports = _stack_per_net(reports_all)
    parameters = _stack_per_net(parameters_all) if parameters_all else None

    prefix = f'{run_type}_' if run_type != '' else ''
    save_name = f'data/{prefix}dd_res_{dt.now():%Y-%m-%d_%H-%M-%S}.npz'

    # Make sure the output directory exists so a finished run is not lost when saving.
    os.makedirs(os.path.dirname(save_name), exist_ok=True)
    np.savez(save_name, snapshots=snaps, reports=reports, ys=ys, train_x_inds=train_x_inds,
             net_params=combined_net_params, per_net_params=per_net_params,
             train_params=combined_train_params, parameters=parameters)

    return save_name, net


def load_res_for_restoring(res_path):
    """
    Read everything needed to rebuild networks from a saved results .npz file.

    Returns the tuple (ys, parameters, net_params, per_net_params, train_params, train_x_inds).
    per_net_params falls back to {} and train_x_inds to None when the file lacks those entries.
    Raises RuntimeError if the stored 'parameters' entry is None.

    NOTE: the file is opened with allow_pickle=True, so only load result files from a trusted source.
    """
    with np.load(res_path, allow_pickle=True) as archive:
        saved_parameters = archive['parameters'].item()
        if saved_parameters is None:
            raise RuntimeError('Cannot restore this network - parameters not saved')

        saved_ys = archive['ys']
        saved_net_params = archive['net_params'].item()
        saved_train_params = archive['train_params'].item()

        # Optional entries - may be absent from the file
        saved_per_net = {}
        if 'per_net_params' in archive:
            saved_per_net = archive['per_net_params'].item()

        saved_train_inds = None
        if 'train_x_inds' in archive:
            saved_train_inds = archive['train_x_inds']

    return (saved_ys, saved_parameters, saved_net_params,
            saved_per_net, saved_train_params, saved_train_inds)


def restore_loaded_net(ys, parameters, net_params, per_net_params, train_params, train_x_inds,
                       net_ind, epoch):
    """
    Rebuild network number net_ind with the parameters saved at the given epoch.

    Arguments:
    - ys, parameters, net_params, per_net_params, train_params, train_x_inds:
      values as returned by load_res_for_restoring.
    - net_ind: Index of the network to restore.
    - epoch: Epoch whose snapshot should be loaded; -1 selects the last snapshot. If there is
      no snapshot for this epoch, the closest snapshot epoch is used and a warning is issued.

    Returns (net, train_params). The net_params dict passed in is not modified.
    """
    # Result files without this key are treated as having no final evaluation snapshot
    include_final_eval = train_params.get('include_final_eval', False)
    snap_epochs = util.calc_snap_epochs(train_params['snap_freq'], train_params['num_epochs'],
                                        train_params['snap_freq_scale'], include_final_eval)
    if epoch == -1:
        epoch_ind = -1
    else:
        try:
            epoch_ind = snap_epochs.index(epoch)
        except ValueError:
            epoch_ind = int(np.argmin(np.abs([se - epoch for se in snap_epochs])))
            warnings.warn(f'Epoch {epoch} has no associated snapshot - using {snap_epochs[epoch_ind]} instead.')

    epoch_state_dict = {name: torch.tensor(value[net_ind, epoch_ind]) for name, value in parameters.items()}

    # Work on a copy so the per-net overrides below do not leak into the caller's dict,
    # which may be reused to restore other networks.
    net_params = dict(net_params)
    for key, val in per_net_params.items():
        net_params[key] = val[net_ind]

    net_params['verbose'] = False
    net = DisjointDomainNet(**net_params)
    net.load_state_dict(OrderedDict(epoch_state_dict))
    net.y = torch.tensor(ys[net_ind], device=net.device)
    if train_x_inds is not None:
        net.train_x_inds = train_x_inds[net_ind]

    return net, train_params


def restore_net(res_path, net_ind=0, epoch=-1):
    """
    Load the results file at res_path and restore network net_ind from the snapshot
    at the given epoch (or the closest saved epoch). Returns what restore_loaded_net returns.
    """
    loaded = load_res_for_restoring(res_path)
    return restore_loaded_net(*loaded, net_ind=net_ind, epoch=epoch)


def restore_each_net_over_epochs(res_path, epochs):
    """
    For each net saved in res_path, yield a generator that restores the net at each epoch in epochs.

    The results file is loaded once, when this function is called. Networks are restored lazily,
    as the inner generators are consumed. Each inner generator yields the return value of
    restore_loaded_net for one (network, epoch) pair, in the order given by epochs.
    """
    ys, *other_net_vars = load_res_for_restoring(res_path)

    def _restore_over_epochs(net_ind):
        # net_ind is bound here as a function argument, so each inner generator keeps its own
        # network index even if the outer generator is advanced before this one is consumed.
        return (
            restore_loaded_net(ys, *other_net_vars, net_ind=net_ind, epoch=epoch)
            for epoch in epochs
        )

    return (_restore_over_epochs(net_ind) for net_ind in range(len(ys)))


def restore_and_holdout_test(res_path, epochs, save_path=None,
                             net_restorer_generator=restore_each_net_over_epochs):
    """
    Restore each net saved in res_path for each epoch in epochs and do domain holdout test.

    For every restored net, generalize_test is run with all inputs included and with the inputs
    from index len(net.train_x_inds) onward as the test set (presumably the held-out inputs are
    the trailing indices - confirm against how the nets were trained). The resulting
    epochs-to-generalize values are saved, together with epochs, to save_path. When save_path
    is None, it is res_path with its extension replaced by '_domain_holdout.npz'.
    """
    if save_path is None:
        path_root, _ = os.path.splitext(res_path)
        save_path = path_root + '_domain_holdout.npz'

    etg_rows = []
    for net_num, restored_nets in enumerate(net_restorer_generator(res_path, epochs)):
        etg_row = np.zeros(len(epochs))
        print(f'Network {net_num} testing start')
        print('-------------------------')

        for col, (epoch, (net, train_params)) in enumerate(zip(epochs, restored_nets)):
            batch_size = train_params['batch_size']
            sgd = torch.optim.SGD(net.parameters(), lr=train_params['lr'])
            all_inds = np.arange(net.n_inputs)
            held_out_inds = np.arange(len(net.train_x_inds), net.n_inputs)
            thresh = train_params['test_thresh']
            max_epochs = train_params['test_max_epochs']
            # Fall back to the default criterion when the saved train params do not specify one
            test_crit = train_params.get('test_criterion', train_defaults['test_criterion'])

            n_epochs_needed, n_epochs_str = net.generalize_test(
                batch_size, sgd, all_inds, held_out_inds, test_crit=test_crit,
                thresh=thresh, max_epochs=max_epochs)
            etg_row[col] = n_epochs_needed
            print(f'Epoch {epoch}: {n_epochs_str} epochs to generalize')

        print()
        etg_rows.append(etg_row)

    etg_all = np.stack(etg_rows)
    np.savez(save_path, test_epochs=epochs, etg=etg_all)