I did some searching and found this.
Binary vs one hot
Thanks fleabay!
I added in the following statement:
flat_file = np.transpose(flat_file, (2, 0, 1))
Now I'm getting a different error:
RuntimeError: mat1 and mat2 shapes cannot be multiplied (198x10816 and 198x128)
The full code is:
import numpy as np
import math
import cv2
import random
import torch
from torch import flatten
from torch.autograd import Variable
import torch.nn as nn
import os.path
from os import path
img_width = 64
num_channels = 3
num_input_components = img_width*img_width*num_channels
num_output_components = 1
num_epochs = 100
learning_rate = 0.00001
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv_layer1 = nn.Conv2d(in_channels=num_channels, out_channels=32, kernel_size=3)
self.conv_layer2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3)
self.max_pool1 = nn.MaxPool2d(kernel_size = 2, stride = 2)
self.conv_layer3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
self.conv_layer4 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3)
self.max_pool2 = nn.MaxPool2d(kernel_size = 2, stride = 2)
self.fc1 = nn.Linear(1600, 128)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(128, num_output_components)
# Progresses data across layers
def forward(self, x):
out = self.conv_layer1(x)
out = self.conv_layer2(out)
out = self.max_pool1(out)
out = self.conv_layer3(out)
out = self.conv_layer4(out)
out = self.max_pool2(out)
out = out.reshape(out.size(0), -1)
out = self.fc1(out)
out = self.relu1(out)
out = self.fc2(out)
return out
"""
def __init__(self):
# call the parent constructor
super(Net, self).__init__()
# initialize first set of CONV => RELU => POOL layers
self.conv1 = nn.Conv2d(in_channels=num_channels, out_channels=20, kernel_size=(5, 5))
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
# initialize second set of CONV => RELU => POOL layers
self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
self.relu2 = nn.ReLU()
self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
# initialize first (and only) set of FC => RELU layers
self.fc1 = nn.Linear(in_features=800, out_features=500)
self.relu3 = nn.ReLU()
# initialize our softmax classifier
self.fc2 = nn.Linear(in_features=500, out_features=num_output_components)
self.logSoftmax = nn.LogSoftmax(dim=1)
def forward(self, x):
# pass the input through our first set of CONV => RELU =>
# POOL layers
x = self.conv1(x)
x = self.relu1(x)
x = self.maxpool1(x)
# pass the output from the previous layer through the second
# set of CONV => RELU => POOL layers
x = self.conv2(x)
x = self.relu2(x)
x = self.maxpool2(x)
# flatten the output from the previous layer and pass it
# through our only set of FC => RELU layers
x = flatten(x, 1)
x = self.fc1(x)
x = self.relu3(x)
# pass the output to our softmax classifier to get our output
# predictions
x = self.fc2(x)
output = self.logSoftmax(x)
# return the output predictions
return output
"""
"""
def __init__(self):
super(Net, self).__init__()
self.hidden1 = torch.nn.Linear(num_input_components, 8192)
self.hidden2 = torch.nn.Linear(8192, 1024)
self.hidden3 = torch.nn.Linear(1024, 128)
self.predict = torch.nn.Linear(128, num_output_components)
def forward(self, x):
x = torch.tanh(self.hidden1(x))
x = torch.tanh(self.hidden2(x))
x = torch.tanh(self.hidden3(x))
x = self.predict(x) # linear output
return x
"""
class float_image:
def __init__(self, img):
self.img = img
class image_type:
def __init__(self, img_type, float_img):
self.img_type = img_type
self.float_img = float_img
net = Net()
if False: #path.exists('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'):
net.load_state_dict(torch.load('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'))
print("loaded file successfully")
else:
print("training...")
all_train_files = []
file_count = 0
path = 'training_set\\cats\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 100:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 #np.asarray(res).flatten() / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(0, flat_file))
file_count = 0
path = 'training_set\\dogs\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 100:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 #np.asarray(res).flatten() / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(1, flat_file))
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate)
loss_func = torch.nn.MSELoss()
batch = np.zeros((len(all_train_files), num_channels, img_width, img_width), dtype=np.float32)
ground_truth = np.zeros((len(all_train_files), 1), dtype=np.float32)
random.shuffle(all_train_files)
count = 0
for i in all_train_files:
batch[count] = i.float_img
ground_truth[count] = i.img_type
count = count + 1
for epoch in range(num_epochs):
x = Variable(torch.from_numpy(batch))
y = Variable(torch.from_numpy(ground_truth))
prediction = net(x)
loss = loss_func(prediction, y)
print(epoch, loss)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
#torch.save(net.state_dict(), 'weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth')
path = 'test_set\\cats\\'
filenames = next(os.walk(path))[2]
cat_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0# np.asarray(res).flatten() / 255.0
batch = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction < 0.5:
cat_count = cat_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(cat_count / total_count)
print(total_count)
path = 'test_set\\dogs\\'
filenames = next(os.walk(path))[2]
dog_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 # np.asarray(res).flatten() / 255.0
batch = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction > 0.5:
dog_count = dog_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(dog_count / total_count)
print(total_count)
So I fine-tuned the size of the tensors, and now I'm getting this error:
Traceback (most recent call last):
File "img_train.py", line 197, in <module>
prediction = net(x)
...
RuntimeError: mat1 and mat2 shapes cannot be multiplied (198x8450 and 198x8450)
I'm flabbergasted. Why don't these two shapes multiply, when they are the same size?
import numpy as np
import math
import cv2
import random
import torch
from torch import flatten
from torch.autograd import Variable
import torch.nn as nn
import os.path
from os import path
img_width = 64
num_channels = 3
num_input_components = img_width*img_width*num_channels
num_output_components = 1
num_epochs = 100
learning_rate = 0.00001
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self):
# call the parent constructor
super(Net, self).__init__()
# initialize first set of CONV => RELU => POOL layers
self.conv1 = nn.Conv2d(in_channels=num_channels, out_channels=20, kernel_size=(5, 5))
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
# initialize second set of CONV => RELU => POOL layers
self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=(5, 5))
self.relu2 = nn.ReLU()
self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
# initialize first (and only) set of FC => RELU layers
self.fc1 = nn.Linear(in_features=198, out_features=8450)
self.relu3 = nn.ReLU()
# initialize our softmax classifier
self.fc2 = nn.Linear(in_features=500, out_features=num_output_components)
self.logSoftmax = nn.LogSoftmax(dim=1)
def forward(self, x):
# pass the input through our first set of CONV => RELU =>
# POOL layers
x = self.conv1(x)
x = self.relu1(x)
x = self.maxpool1(x)
# pass the output from the previous layer through the second
# set of CONV => RELU => POOL layers
x = self.conv2(x)
x = self.relu2(x)
x = self.maxpool2(x)
# flatten the output from the previous layer and pass it
# through our only set of FC => RELU layers
x = flatten(x, 1)
x = self.fc1(x)
x = self.relu3(x)
# pass the output to our softmax classifier to get our output
# predictions
x = self.fc2(x)
output = self.logSoftmax(x)
# return the output predictions
return output
"""
def __init__(self):
super(Net, self).__init__()
self.hidden1 = torch.nn.Linear(num_input_components, 8192)
self.hidden2 = torch.nn.Linear(8192, 1024)
self.hidden3 = torch.nn.Linear(1024, 128)
self.predict = torch.nn.Linear(128, num_output_components)
def forward(self, x):
x = torch.tanh(self.hidden1(x))
x = torch.tanh(self.hidden2(x))
x = torch.tanh(self.hidden3(x))
x = self.predict(x) # linear output
return x
"""
class float_image:
def __init__(self, img):
self.img = img
class image_type:
def __init__(self, img_type, float_img):
self.img_type = img_type
self.float_img = float_img
net = Net()
if False: #path.exists('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'):
net.load_state_dict(torch.load('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'))
print("loaded file successfully")
else:
print("training...")
all_train_files = []
file_count = 0
path = 'training_set\\cats\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 100:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 #np.asarray(res).flatten() / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(0, flat_file))
file_count = 0
path = 'training_set\\dogs\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 100:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 #np.asarray(res).flatten() / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(1, flat_file))
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate)
loss_func = torch.nn.MSELoss()
batch = np.zeros((len(all_train_files), num_channels, img_width, img_width), dtype=np.float32)
ground_truth = np.zeros((len(all_train_files), 1), dtype=np.float32)
random.shuffle(all_train_files)
count = 0
for i in all_train_files:
batch[count] = i.float_img
ground_truth[count] = i.img_type
count = count + 1
for epoch in range(num_epochs):
x = Variable(torch.from_numpy(batch))
y = Variable(torch.from_numpy(ground_truth))
prediction = net(x)
loss = loss_func(prediction, y)
print(epoch, loss)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
#torch.save(net.state_dict(), 'weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth')
path = 'test_set\\cats\\'
filenames = next(os.walk(path))[2]
cat_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0# np.asarray(res).flatten() / 255.0
batch = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction < 0.5:
cat_count = cat_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(cat_count / total_count)
print(total_count)
path = 'test_set\\dogs\\'
filenames = next(os.walk(path))[2]
dog_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0 # np.asarray(res).flatten() / 255.0
batch = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction > 0.5:
dog_count = dog_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(dog_count / total_count)
print(total_count)
In matrix multiplication the number of columns of the first matrix should match the number of rows of the second matrix.
I think that I need to read a couple of books on the subject, before I proceed. Thanks all!
OK, so I've made some progress.
I'm now using one-hot.
I just don't understand why 8*img_width*img_width comes into play? Why does the magic number 8 come into play?
import numpy as np
import math
import cv2
import random
import torch
from torch import flatten
from torch.autograd import Variable
import torch.nn as nn
import os.path
from os import path
img_width = 32
num_channels = 3
num_input_components = img_width*img_width*num_channels
num_output_components = 2
num_epochs = 100
learning_rate = 0.0001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Net(nn.Module):
def __init__(self, num_channels, num_output_components, all_train_files_len):
# call the parent constructor
super(Net, self).__init__()
self.conv1 = nn.Conv2d(num_channels, img_width, kernel_size=(3,3), stride=1, padding=1)
self.act1 = nn.ReLU()
self.drop1 = nn.Dropout(0.3)
self.conv2 = nn.Conv2d(img_width, img_width, kernel_size=(3,3), stride=1, padding=1)
self.act2 = nn.ReLU()
self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
self.flat = nn.Flatten()
self.fc3 = nn.Linear(8*img_width*img_width, 512)
self.act3 = nn.ReLU()
self.drop3 = nn.Dropout(0.5)
self.fc4 = nn.Linear(512, num_output_components)
def forward(self, x):
# input 3x32x32, output 32x32x32
x = self.act1(self.conv1(x))
x = self.drop1(x)
# input 32x32x32, output 32x32x32
x = self.act2(self.conv2(x))
# input 32x32x32, output 32x16x16
x = self.pool2(x)
# input 32x16x16, output 8192
x = self.flat(x)
# input 8192, output 512
x = self.act3(self.fc3(x))
x = self.drop3(x)
# input 512, output 10
x = self.fc4(x)
return x
"""
def __init__(self):
super(Net, self).__init__()
self.hidden1 = torch.nn.Linear(num_input_components, 8192)
self.hidden2 = torch.nn.Linear(8192, 1024)
self.hidden3 = torch.nn.Linear(1024, 128)
self.predict = torch.nn.Linear(128, num_output_components)
def forward(self, x):
x = torch.tanh(self.hidden1(x))
x = torch.tanh(self.hidden2(x))
x = torch.tanh(self.hidden3(x))
x = self.predict(x) # linear output
return x
"""
class float_image:
def __init__(self, img):
self.img = img
class image_type:
def __init__(self, img_type, float_img):
self.img_type = img_type
self.float_img = float_img
if False: #path.exists('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'):
net.load_state_dict(torch.load('weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth'))
print("loaded file successfully")
else:
print("training...")
all_train_files = []
file_count = 0
path = 'training_set\\cats\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 10000:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(0, flat_file))
file_count = 0
path = 'training_set\\dogs\\'
filenames = next(os.walk(path))[2]
for f in filenames:
file_count = file_count + 1
if file_count >= 10000:
break;
print(path + f)
img = cv2.imread(path + f).astype(np.float32)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
all_train_files.append(image_type(1, flat_file))
net = Net(num_channels, num_output_components, len(all_train_files))
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate)
loss_func = torch.nn.MSELoss()
batch = np.zeros((len(all_train_files), num_channels, img_width, img_width), dtype=np.float32)
ground_truth = np.zeros((len(all_train_files), num_output_components), dtype=np.float32)
random.shuffle(all_train_files)
count = 0
for i in all_train_files:
batch[count] = i.float_img
if i.img_type == 0:
ground_truth[count][0] = 1
ground_truth[count][1] = 0
elif i.img_type == 1:
ground_truth[count][0] = 0
ground_truth[count][1] = 1
count = count + 1
x = Variable(torch.from_numpy(batch))
y = Variable(torch.from_numpy(ground_truth))
for epoch in range(num_epochs):
prediction = net(x)
loss = loss_func(prediction, y)
print(epoch, loss)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
#torch.save(net.state_dict(), 'weights_' + str(num_input_components) + '_' + str(num_epochs) + '.pth')
path = 'test_set\\cats\\'
filenames = next(os.walk(path))[2]
cat_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
batch = torch.zeros((1, num_channels, img_width, img_width), dtype=torch.float32)
batch[0] = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction[0][0] > prediction[0][1]:
cat_count = cat_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(cat_count / total_count)
print(total_count)
path = 'test_set\\dogs\\'
filenames = next(os.walk(path))[2]
dog_count = 0
total_count = 0
for f in filenames:
# print(path + f)
img = cv2.imread(path + f).astype(np.float32)
res = cv2.resize(img, dsize=(img_width, img_width), interpolation=cv2.INTER_LINEAR)
flat_file = res / 255.0
flat_file = np.transpose(flat_file, (2, 0, 1))
batch = torch.zeros((1, num_channels, img_width, img_width), dtype=torch.float32)
batch[0] = torch.from_numpy(flat_file)
prediction = net(Variable(batch))
if prediction[0][0] < prediction[0][1]:
dog_count = dog_count + 1
total_count = total_count + 1
# print(batch)
# print(prediction)
print(dog_count / total_count)
print(total_count)
I just don't understand why 8*img_width*img_width comes into play? Why does the magic number 8 come into play?
You start with a 3x(32x32) image. Then you do a convolution that brings that up to 32x(32x32). At some point you do a MaxPool2d operation that shrinks the image to 16x16, so now you have 32x(16x16). After you flatten, that's just 8192 numbers, forgetting the structure as an image. That happens to be 8*32*32, but it's not a very good way to look at it. As you said, the “8” is kind of meaningless (it's the number of channels you are using divided by 4 because one of your operations shrank the image by a factor of 2 in each dimension).
Do you have a better example code? There are several on the Internet, but I haven't found one that works sort of perfectly.
I found a different code, but it only gets it right like 70% of the time, which is still no good.
Is is the data that makes it so underwhelming?
class Net(torch.nn.Module):
def __init__(self, num_channels, num_output_components, all_train_files_len):
super().__init__()
self.model = torch.nn.Sequential(
#Input = 3 x 32 x 32, Output = 32 x 32 x 32
torch.nn.Conv2d(in_channels = num_channels, out_channels = 32, kernel_size = 3, padding = 1),
torch.nn.ReLU(),
#Input = 32 x 32 x 32, Output = 32 x 16 x 16
torch.nn.MaxPool2d(kernel_size=2),
#Input = 32 x 16 x 16, Output = 64 x 16 x 16
torch.nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
torch.nn.ReLU(),
#Input = 64 x 16 x 16, Output = 64 x 8 x 8
torch.nn.MaxPool2d(kernel_size=2),
#Input = 64 x 8 x 8, Output = 64 x 8 x 8
torch.nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, padding = 1),
torch.nn.ReLU(),
#Input = 64 x 8 x 8, Output = 64 x 4 x 4
torch.nn.MaxPool2d(kernel_size=2),
torch.nn.Flatten(),
torch.nn.Linear(64*4*4, all_train_files_len),
torch.nn.ReLU(),
torch.nn.Linear(all_train_files_len, num_output_components)
)
def forward(self, x):
return self.model(x)