""" Modified MNIST training set for binary image classification """ import json import pathlib from relations import Relation, random_relation from itertools import product def import_mnist_data(data_type): """ Create an iterator for MNIST data. The resulting JSON files have each line representing a greyscale image of a handwritten digit. Each line is a dictionary whose keys are integers between 1 and 255, or the string 'label'. The values associated to the integer keys are lists of the coordinates at which the greyscale value for the MNIST image is equal to the key. For example, if the greyscale value 25 is found at coordinate [2,14], then the key 25 would be associated to a list of pairs, one of which is [2,14]. Any pairs not belonging to a value in the dictionary are assumed to be assigned greyscale value 0. The 'label' key has an integer value between 0 and 9, indicated the intended handwritten digit for the corresponding image. Argument: data_type (str): Either 'train' or 'test', depending on which data one would like to convert. Yields: dict: The dictionary of data specifying a greyscale image and its intended handwritten digit. """ with open(str(pathlib.Path.cwd().parent) + \ '//..//JSONforMNIST//{}_data.json'.format(data_type), 'r') as read_file: for line in read_file: data = json.loads(line) # By default, all the integer keys in the dictionary returned from # the JSON file will be converted to strings. Let's undo this. cleaned_data = \ {int(key): data[key] for key in data if key != 'label'} cleaned_data['label'] = data['label'] yield cleaned_data def greyscale_to_binary(image, cutoff=127): """ Convert a greyscale image from the MNIST training set to a binary relation. Arguments: image (dict): A dictionary representing a greyscale image as described in `import_mnist_data`. cutoff (int): Any pixel coordinates in `image` which are over this value will be taken to be in the relation. Returns: Relation: A binary relation on a universe of size 28 whose pairs are those coordinates from `image` which are at least as large as `cutoff`. """ pairs = [] for val in range(cutoff, 256): if val in image: pairs += image[val] return Relation(pairs, 28) def mnist_binary_relations(data_type, cutoff=127): """ Create an iterator for binary relations coming from MNIST data. Arguments: data_type (str): Either 'train' or 'test', depending on which data one would like to examine. cutoff: Any pixel coordinates in a greyscale image which are over this value will be taken to be in the corresponding relation. Yields: tuple: A binary relation corresponding to a greyscale image from an MNIST dataset and its corresponding integer label. """ data = import_mnist_data(data_type) for dic in data: yield greyscale_to_binary(dic, cutoff), dic['label'] def build_training_data(pairs, data_type, cutoff=127): """ Create an iterable of pairs for training or testing a discrete neural net using the MNIST datasets. Either the train data or the test data from MNIST may be used. The following values provided in `pairs` will be substituted for binary relations: 0, 1, 2, 3, 4, 5, 6, 7, 8, or 9: These `int`s will be replaced with a corresponding handwritten digit from MNIST dataset. 'Empty', 'Full': These strings will be replaced with the empty binary relation and the full binary relation, respectively. Arguments: pairs (iterable of tuple): A sequence of pairs, the first entry being a tuple of inputs and the second entry being a tuple of outputs. It is assumed that all the first-entry tuples have the same length, which is the number of input nodes in a neural net to be trained/tested on such data. Similarly, the second-entry tuples are assumed to have the same length, which is the number of output nodes in a neural net to be trained/tested on such data. See the description above for possible values that these tuples may contain. data_type (str): Either 'train' or 'test', depending on which data one would like to examine. cutoff (int): Any pixel coordinates in a greyscale image which are over this value will be taken to be in the corresponding relation. Yields: tuple: A pair whose first entry is a dictionary indicating that a tuple of binary relations is to be fed into a discrete neural net as the inputs `x0`, `x1`, `x2`, etc. and whose second entry is a tuple of binary relations which should appear as the corresponding outputs. """ # Create a dictionary for the substitutions described above. The images # corresponding to the digits will be updated dynamically from the MNIST # training data. substitution_dic = {i: None for i in range(10)} substitution_dic['Empty'] = Relation(tuple(), 28, 2) substitution_dic['Full'] = Relation(product(range(28), repeat=2), 28) substitution_dic['One pixel'] = Relation(((0, 0),), 28) # Load the MNIST data data = mnist_binary_relations(data_type, cutoff) # Initialize the images corresponding to the digits. for i in range(10): # For each digit, we try to find a candidate image. while not substitution_dic[i]: # We pull the next image from MNIST. new_image = next(data) # If an image for that digit hasn't been found yet, regardless of # whether it was the one we intended to look for, that image will # be added as the one representing its digit in `substitution_dic`. if not substitution_dic[new_image[1]]: substitution_dic[new_image[1]] = new_image[0] for pair in pairs: # Update one of the digits using the next values from MNIST. new_image = next(data) substitution_dic[new_image[1]] = new_image[0] # Choose a new random image. substitution_dic['Random'] = random_relation(28) yield {'x{}'.format(i): substitution_dic[pair[0][i]] for i in range(len(pair[0]))}, \ tuple(substitution_dic[pair[1][i]] for i in range(len(pair[1]))) def binary_mnist_zero_one(quantity_of_zeroes, data_type, \ quantity_of_ones=None, cutoff=127): """ Create a data set for training a discrete neural net to recognize handwritten zeroes and ones. Zeroes are labeled with the empty relation and ones are labeled with the full relation. Arguments: quantity_of_zeroes (int): The number of examples of handwritten zeroes to show. data_type (str): Either 'train' or 'test', depending on which data one would like to examine. quantity_of_ones (int): The number of examples of handwritten ones to show. cutoff (int): Any pixel coordinates in a greyscale image which are over this value will be taken to be in the corresponding relation. Returns: iterable: An iterable of training data where handwritten zeroes and ones are mapped to full and empty relations. """ # If the number of ones to use is not specified, it is assumed to be the # same as the number of zeroes. if quantity_of_ones is None: quantity_of_ones = quantity_of_zeroes pairs = [((0,), ('Empty',)) for _ in range(quantity_of_zeroes)] pairs += [((1,), ('Full',)) for _ in range(quantity_of_ones)] return build_training_data(pairs, data_type, cutoff) def experiment_mnist_zero_one(quantity_of_zeroes, data_type, \ quantity_of_ones=None, cutoff=127): # If the number of ones to use is not specified, it is assumed to be the # same as the number of zeroes. if quantity_of_ones is None: quantity_of_ones = quantity_of_zeroes pairs = [(('Random',), (0,)) for _ in range(quantity_of_zeroes)] pairs += [(('Random',), (1,)) for _ in range(quantity_of_ones)] return build_training_data(pairs, data_type, cutoff)