Voxel51
Published in

Voxel51

Tunnel vision in computer vision: can ChatGPT see?

Depiction of an AI trying to see. Generated by DALL-E2.

What is ChatGPT?

Where ChatGPT excels

Commonplace code

import numpy as np

def fft(x):
"""Compute the discrete Fourier Transform of the 1D array x"""
x = np.asarray(x, dtype=float)
N = x.shape[0]

if N <= 1: # base case
return x

# split the array into even and odd components
even = fft(x[0::2])
odd = fft(x[1::2])

# compute the FFT of the even and odd components
T = [np.exp(-2j*np.pi*k/N)*odd[k] for k in range(N//2)]
return np.concatenate([even + T, even - T])
def min_pooling(inputs, pool_size):
# Get the dimensions of the input array
n_rows, n_cols, n_channels = inputs.shape

# Calculate the size of the output array
out_rows = n_rows // pool_size
out_cols = n_cols // pool_size

# Create an empty array to store the output
outputs = np.zeros((out_rows, out_cols, n_channels))

# Iterate over the input array and compute the output
for r in range(out_rows):
for c in range(out_cols):
for k in range(n_channels):
# Extract a patch of the input array
patch = inputs[r*pool_size:(r+1)*pool_size, c*pool_size:(c+1)*pool_size, k]

# Compute the min of the patch
outputs[r, c, k] = np.min(patch)

return outputs

Dressed method calls

from PIL import Image, ImageFilter

# Open the image
im = Image.open('image.jpg')

# Apply a sharpening filter
sharpened_im = im.filter(ImageFilter.SHARPEN)

# Save the sharpened image
sharpened_im.save('sharpened_image.jpg')
from skimage import io
from skimage.filters import threshold_otsu

# Load the image
image = io.imread('image.png')

# Convert the image to grayscale
gray_image = np.mean(image, axis=2)

# Calculate the Otsu threshold
threshold = threshold_otsu(gray_image)

# Apply the threshold to the image to create a binary image
binary_image = gray_image > threshold
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform
from PIL import Image

def tsne(images, perplexity=30.0, learning_rate=200.0, n_components=2):
# Convert the images to grayscale
grayscale_images = [np.array(image.convert('L')) for image in images]

# Stack the grayscale images into a single array
stacked_images = np.stack(grayscale_images)

# Flatten the stacked images into a single array
flattened_images = stacked_images.reshape(len(images), -1)

# Compute the pairwise distances between the flattened images
distances = squareform(pdist(flattened_images, 'euclidean'))

# Compute the low-dimensional embedding of the images using t-SNE
tsne = TSNE(perplexity=perplexity, learning_rate=learning_rate, n_components=n_components, init='random')
embedding = tsne.fit_transform(distances)

return embedding

Concatenating simple components

import matplotlib.pyplot as plt
from skimage import io, transform
from skimage.util import random_noise

# load and resize images
img1 = io.imread("image1.png")
img1 = transform.resize(img1, (100, 200))
img2 = io.imread("image2.png")
img2 = transform.resize(img2, (100, 200))

# augment dataset with 6 additional images
augmented_imgs = []
for i in range(6):
augmented_imgs.append(random_noise(img1))
augmented_imgs.append(random_noise(img2))

# blend images
blended_img = img1 * 0.5 + img2 * 0.5
for img in augmented_imgs:
blended_img += img * 0.5 / len(augmented_imgs)

# plot all images in a grid
fig, ax = plt.subplots(nrows=3, ncols=3)
ax[0, 0].imshow(img1)
ax[0, 1].imshow(img2)
ax[0, 2].imshow(blended_img)
for i in range(6):
ax[1 + i // 3, i % 3].imshow(augmented_imgs[i])

# add text to third plot
ax[0, 2].set_title("Resized, augmented, and blended images")

Surprising successes

  1. Performing segmentation on 3d point clouds (it used DBSCAN)
  2. Using DeOldify to restore color in old images
  3. Perform object tracking in videos

Where it falters

Look out for the long tail

import tensorflow as tf

# Define the input layer
input_layer = tf.keras.layers.Input(shape=(None, 3))

# Use a shared MLP to process the input point cloud
mlp1 = tf.keras.layers.Dense(64, activation="relu")
mlp2 = tf.keras.layers.Dense(64, activation="relu")
mlp3 = tf.keras.layers.Dense(64, activation="relu")
x = mlp1(input_layer)
x = mlp2(x)
x = mlp3(x)

# Use max pooling to get the features of the entire point cloud
global_features = tf.keras.layers.GlobalMaxPooling1D()(x)

# Use a fully connected layer to predict the output
output_layer = tf.keras.layers.Dense(units=1, activation=None)(global_features)

# Create the model
model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
import torch
from torch import nn

class PointNet(nn.Module):
def __init__(self):
super(PointNet, self).__init__()

# define the neural network layers
self.fc1 = nn.Linear(3, 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, 64)
self.fc4 = nn.Linear(64, 128)
self.fc5 = nn.Linear(128, 1024)
self.fc6 = nn.Linear(1024, 512)
self.fc7 = nn.Linear(512, 256)
self.fc8 = nn.Linear(256, 128)
self.fc9 = nn.Linear(128, 64)
self.fc10 = nn.Linear(64, 10)

def forward(self, x):
# apply the neural network layers to the input point cloud
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
x = F.relu(self.fc6(x))
x = F.relu(self.fc7(x))
x = F.relu(self.fc8(x))
x = F.relu(self.fc9(x))
x = self.fc10(x)
return x

Math manipulations get messy

def otsus_method(img):
# calculate histogram of grayscale values
histogram = np.zeros(256)
for i in range(img.shape[0]):
for j in range(img.shape[1]):
histogram[img[i, j]] += 1

# normalize histogram
histogram = histogram / img.size

# calculate cumulative sum of normalized histogram
cumulative_sum = np.cumsum(histogram)

# calculate cumulative sum of squares of normalized histogram
cumulative_sum_of_squares = np.cumsum(histogram ** 2)

# initialize variables
optimal_threshold = 0
max_variance = 0

# iterate over all possible threshold values
for i in range(256):
# calculate background and foreground probabilities
p_background = cumulative_sum[i]
p_foreground = 1 - p_background

# calculate means
mean_background = np.sum(np.arange(256) * histogram[:i+1]) / p_background
mean_foreground = np.sum(np.arange(i+1, 256) * histogram[i+1:]) / p_foreground

# calculate between-class variance
variance = p_background * p_foreground * (mean_background - mean_foreground) ** 2

# update optimal threshold if necessary
if variance > max_variance:
max_variance = variance
optimal_threshold = i

return optimal_threshold
Error thrown after running ChatGPT generated code for Otsu thresholding
def hough_transform(img, theta_resolution=1, rho_resolution=1):
# convert image to grayscale
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# detect edges in the image using Canny edge detection
edges = cv2.Canny(img, 50, 200)

# create accumulator array with dimensions determined by theta and rho resolutions
theta = np.linspace(-90, 90, 180 // theta_resolution)
rho = np.linspace(-np.sqrt(2) * img.shape[0], np.sqrt(2) * img.shape[0], 2 * np.sqrt(2) * img.shape[0] // rho_resolution)
accumulator = np.zeros((len(theta), len(rho)))

# find non-zero edge pixels in the image
y_idxs, x_idxs = np.nonzero(edges)

# convert x and y indices to cartesian coordinates
points = np.stack([x_idxs, y_idxs], axis=1)

# iterate over all edge points
for x, y in points:
# iterate over all theta values
for i, t in enumerate(theta):
# calculate rho for current theta value
r = x * np.cos(t) + y * np.sin(t)

# find closest rho value and increment corresponding accumulator bin
j = np.argmin(np.abs(rho - r))
accumulator[i, j] += 1

return accumulator, theta, rho
Error thrown after running ChatGPT generated code for Hough transform

Expansive solutions breed errors

import pytesseract

# Load the license plate image
# image = cv2.imread("license_plate.png")

# Convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply Otsu's thresholding to binarize the image
threshold, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)

# Use Tesseract to perform OCR on the license plate image
text = pytesseract.image_to_string(binary, lang="eng")

# Print the recognized text
print(text)

Where to exercise extreme caution

# Define the names and years of the 10 best object detection models
model_names = ["Faster R-CNN", "YOLOv3", "SSD", "RetinaNet", "Mask R-CNN", "R-FCN", "SSD with FPN", "YOLOv4", "Grid R-CNN", "FCOS"]
model_years = [2015, 2018, 2016, 2017, 2017, 2016, 2017, 2020, 2019, 2019]

# Define the inference times of the 10 best object detection models (in milliseconds)
model_inference_times = [100, 30, 20, 40, 60, 50, 25, 15, 35, 45]
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# Calculate the depth of each pixel
depth = 9.0 * 0.54 / (disparity + 0.00001)
depth = baseline * focal_length / (disparity + epsilon)
import os
from PIL import Image
from sklearn.neighbors import NearestNeighbors

# Define the path to the dataset directory
dataset_dir = './dataset'

# Load the images in the dataset
images = []
for filename in os.listdir(dataset_dir):
# Load the image
image = Image.open(os.path.join(dataset_dir, filename))

# Convert the image to grayscale and resize it to a fixed size
image = image.convert('L').resize((128, 128))

# Add the image to the list of images
images.append(image)

# Flatten the images into a single array
flattened_images = np.stack([np.array(image) for image in images]).reshape(len(images), -1)

# Compute the nearest neighbors of the flattened images
nbrs = NearestNeighbors(n_neighbors=5).fit(flattened_images)
distances, indices = nbrs.kneighbors(flattened_images)

# Compute the mean and standard deviation of the distances to the nearest neighbors
mean_distance = np.mean(distances[:, 1:])
std_distance = np.std(distances[:, 1:])

print('Mean distance to nearest neighbor:', mean_distance)
print('Standard deviation of distances:', std_distance)
# Import necessary libraries
import numpy as np
from scipy.stats import ttest_ind
from sklearn.metrics import mean_squared_error

# Load the original and new datasets
original_data = ...
new_data = ...

# Calculate the mean squared error between the original and new images
mse = mean_squared_error(original_data, new_data)

# Use a t-test to compare the means of the original and new datasets
t, p = ttest_ind(original_data, new_data)

# Print the results
print("Mean squared error:", mse)
print("T-test results: t =", t, "p =", p)

# If the mean squared error is large and the t-test p-value is small,
# this indicates a significant distribution shift in the data

Why ChatGPT empowers CV engineers

Just for fun

                   __          __
/ \ / \
/ /\ \ / /\ \
/ / /\ \ / / /\ \
/ / /__\ \ / / /__\ \
/ / /____\ \/ / /____\ \
/ / /______\ \/ /______\ \
/ / / \/ / \ \
/ / / \/ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \
/ / / \ \ \

FiftyOne Computer Vision toolset

--

--

Developer tools for machine learning. Learn more at https://voxel51.com

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Jacob Marks

Machine Learning Engineer and Developer Evangelist @ Voxel51 | Stanford Theoretical Physics PhD | Ex-Google X https://www.linkedin.com/in/jacob-marks