
· 7 min read
Sparsh Agarwal

/img/content-blog-raw-blog-detectron-2-untitled.png

Introduction

Detectron 2 is a next-generation open-source object detection system from Facebook AI Research. With this repo, you can use and train various state-of-the-art models for detection tasks such as bounding-box detection, instance and semantic segmentation, and person keypoint detection.

The following is the directory tree of Detectron 2:

detectron2
├─checkpoint <- checkpointer and model catalog handlers
├─config <- default configs and handlers
├─data <- dataset handlers and data loaders
├─engine <- predictor and trainer engines
├─evaluation <- evaluator for each dataset
├─export <- converter of detectron2 models to caffe2 (ONNX)
├─layers <- custom layers e.g. deformable conv.
├─model_zoo <- pre-trained model links and handler
├─modeling
│ ├─meta_arch <- meta architecture e.g. R-CNN, RetinaNet
│ ├─backbone <- backbone network e.g. ResNet, FPN
│ ├─proposal_generator <- region proposal network
│ └─roi_heads <- head networks for pooled ROIs e.g. box, mask heads
├─solver <- optimizer and scheduler builders
├─structures <- structure classes e.g. Boxes, Instances, etc
└─utils <- utility modules e.g. visualizer, logger, etc

Installation

%%time
!pip install -U torch==1.4+cu100 torchvision==0.5+cu100 -f https://download.pytorch.org/whl/torch_stable.html;
!pip install cython pyyaml==5.1;
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI';
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu100/index.html;

from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

Inference on pre-trained models
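The notebook cells behind each of the following results are not reproduced here. As a rough sketch, every output below comes from loading the corresponding model zoo config and checkpoint into a DefaultPredictor (using the imports above) and running it on the image; the file name input.jpg and the 0.5 score threshold are illustrative:

import cv2

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # illustrative confidence threshold
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
predictor = DefaultPredictor(cfg)

im = cv2.imread("input.jpg")  # illustrative file name
outputs = predictor(im)
v = Visualizer(im[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
v = v.draw_instance_predictions(outputs["instances"].to("cpu"))

Swapping the config/checkpoint pair for the Faster R-CNN, Keypoint R-CNN, or Panoptic FPN entries in the model zoo gives the other results shown below.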

Original image

Object detection with Faster-RCNN-101

Instance segmentation with Mask-RCNN-50

Keypoint estimation with Keypoint-RCNN-50

Panoptic segmentation with Panoptic-FPN-101

Default Mask R-CNN (top) vs. Mask R-CNN with PointRend (bottom) comparison

Fine-tuning on the Balloon Dataset

Load the data

# download, decompress the data
!wget https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip
!unzip balloon_dataset.zip > /dev/null

Convert dataset into Detectron2's standard format

import os, json, itertools
import cv2
import numpy as np

from detectron2.structures import BoxMode

# write a function that loads the dataset into detectron2's standard format
def get_balloon_dicts(img_dir):
    json_file = os.path.join(img_dir, "via_region_data.json")
    with open(json_file) as f:
        imgs_anns = json.load(f)

    dataset_dicts = []
    for _, v in imgs_anns.items():
        record = {}

        filename = os.path.join(img_dir, v["filename"])
        height, width = cv2.imread(filename).shape[:2]

        record["file_name"] = filename
        record["height"] = height
        record["width"] = width

        annos = v["regions"]
        objs = []
        for _, anno in annos.items():
            assert not anno["region_attributes"]
            anno = anno["shape_attributes"]
            px = anno["all_points_x"]
            py = anno["all_points_y"]
            poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
            poly = list(itertools.chain.from_iterable(poly))

            obj = {
                "bbox": [np.min(px), np.min(py), np.max(px), np.max(py)],
                "bbox_mode": BoxMode.XYXY_ABS,
                "segmentation": [poly],
                "category_id": 0,
                "iscrowd": 0
            }
            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts

from detectron2.data import DatasetCatalog, MetadataCatalog

for d in ["train", "val"]:
    DatasetCatalog.register("balloon/" + d, lambda d=d: get_balloon_dicts("balloon/" + d))
    MetadataCatalog.get("balloon/" + d).set(thing_classes=["balloon"])
balloon_metadata = MetadataCatalog.get("balloon/train")

Model configuration and training

from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("balloon/train",)
cfg.DATASETS.TEST = () # no metrics implemented for this dataset
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 300 # 300 iterations seems good enough, but you can certainly train longer
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128 # faster, and good enough for this toy dataset
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1 # only has one class (balloon)

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

Inference and Visualization

import random
from google.colab.patches import cv2_imshow
from detectron2.utils.visualizer import ColorMode

# load weights
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7 # set the testing threshold for this model
# Set the test dataset path
cfg.DATASETS.TEST = ("balloon/val", )
# Create predictor (model for inference)
predictor = DefaultPredictor(cfg)

dataset_dicts = get_balloon_dicts("balloon/val")
for d in random.sample(dataset_dicts, 3):
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    v = Visualizer(im[:, :, ::-1],
                   metadata=balloon_metadata,
                   scale=0.8,
                   instance_mode=ColorMode.IMAGE_BW  # remove the colors of unsegmented pixels
                   )
    v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    cv2_imshow(v.get_image()[:, :, ::-1])

/img/content-blog-raw-blog-detectron-2-untitled-7.png

/img/content-blog-raw-blog-detectron-2-untitled-8.png

/img/content-blog-raw-blog-detectron-2-untitled-9.png

Fine-tuning on the Microcontroller (Chip) Dataset

Load the data

#get the dataset
!pip install -q kaggle
!pip install -q kaggle-cli
os.environ['KAGGLE_USERNAME'] = "sparshag"
os.environ['KAGGLE_KEY'] = "1b1f894d1fa6febe9676681b44ad807b"
!kaggle datasets download -d tannergi/microcontroller-detection
!unzip microcontroller-detection.zip

Convert dataset into Detectron2's standard format

# Registering the dataset
import pandas as pd
from detectron2.structures import BoxMode

def get_microcontroller_dicts(csv_file, img_dir):
    df = pd.read_csv(csv_file)
    df['filename'] = df['filename'].map(lambda x: img_dir + x)

    classes = ['Raspberry_Pi_3', 'Arduino_Nano', 'ESP8266', 'Heltec_ESP32_Lora']

    df['class_int'] = df['class'].map(lambda x: classes.index(x))

    dataset_dicts = []
    for filename in df['filename'].unique().tolist():
        record = {}

        height, width = cv2.imread(filename).shape[:2]

        record["file_name"] = filename
        record["height"] = height
        record["width"] = width

        objs = []
        for index, row in df[(df['filename'] == filename)].iterrows():
            obj = {
                'bbox': [row['xmin'], row['ymin'], row['xmax'], row['ymax']],
                'bbox_mode': BoxMode.XYXY_ABS,
                'category_id': row['class_int'],
                "iscrowd": 0
            }
            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts

classes = ['Raspberry_Pi_3', 'Arduino_Nano', 'ESP8266', 'Heltec_ESP32_Lora']

for d in ["train", "test"]:
    DatasetCatalog.register('microcontroller/' + d, lambda d=d: get_microcontroller_dicts('Microcontroller Detection/' + d + '_labels.csv', 'Microcontroller Detection/' + d + '/'))
    MetadataCatalog.get('microcontroller/' + d).set(thing_classes=classes)
microcontroller_metadata = MetadataCatalog.get('microcontroller/train')

Model configuration and training

# Train the model
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ('microcontroller/train',)
cfg.DATASETS.TEST = () # no metrics implemented for this dataset
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.MAX_ITER = 1000
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

/img/content-blog-raw-blog-detectron-2-untitled-10.png

/img/content-blog-raw-blog-detectron-2-untitled-11.png

Inference and Visualization

cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8 # set the testing threshold for this model
cfg.DATASETS.TEST = ('microcontroller/test', )
predictor = DefaultPredictor(cfg)

df_test = pd.read_csv('Microcontroller Detection/test_labels.csv')

dataset_dicts = DatasetCatalog.get('microcontroller/test')
for d in random.sample(dataset_dicts, 3):
    im = cv2.imread(d["file_name"])
    outputs = predictor(im)
    v = Visualizer(im[:, :, ::-1],
                   metadata=microcontroller_metadata,
                   scale=0.8
                   )
    v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    cv2_imshow(v.get_image()[:, :, ::-1])

Real-time Webcam inference

from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
    js = Javascript('''
        async function takePhoto(quality) {
          const div = document.createElement('div');
          const capture = document.createElement('button');
          capture.textContent = 'Capture';
          div.appendChild(capture);

          const video = document.createElement('video');
          video.style.display = 'block';
          const stream = await navigator.mediaDevices.getUserMedia({video: true});

          document.body.appendChild(div);
          div.appendChild(video);
          video.srcObject = stream;
          await video.play();

          // Resize the output to fit the video element.
          google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

          // Wait for Capture to be clicked.
          await new Promise((resolve) => capture.onclick = resolve);

          const canvas = document.createElement('canvas');
          canvas.width = video.videoWidth;
          canvas.height = video.videoHeight;
          canvas.getContext('2d').drawImage(video, 0, 0);
          stream.getVideoTracks()[0].stop();
          div.remove();
          return canvas.toDataURL('image/jpeg', quality);
        }
        ''')
    display(js)
    data = eval_js('takePhoto({})'.format(quality))
    binary = b64decode(data.split(',')[1])
    with open(filename, 'wb') as f:
        f.write(binary)
    return filename

from IPython.display import Image

try:
    filename = take_photo()
    print('Saved to {}'.format(filename))

    # Show the image which was just taken.
    display(Image(filename))
except Exception as err:
    # Errors will be thrown if the user does not have a webcam or if they do not
    # grant the page permission to access it.
    print(str(err))

model_path = '/content/output/model_final.pth'
config_path = model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml")

# Create config
cfg = get_cfg()
cfg.merge_from_file(config_path)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4  # must match the fine-tuned microcontroller model
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.1
cfg.MODEL.WEIGHTS = model_path

predictor = DefaultPredictor(cfg)

im = cv2.imread('photo.jpg')
outputs = predictor(im)

v = Visualizer(im[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2_imshow(v.get_image()[:, :, ::-1])

Fine-tuning on Face dataset

The process is the same. Here is the output.

/img/content-blog-raw-blog-detectron-2-untitled-12.png

/img/content-blog-raw-blog-detectron-2-untitled-13.png

/img/content-blog-raw-blog-detectron-2-untitled-14.png

Behind the scenes

/img/content-blog-raw-blog-detectron-2-untitled-15.png

· 4 min read
Sparsh Agarwal

/img/content-blog-raw-blog-image-similarity-system-untitled.png

Choice of variables

Image Encoder

We can select any pre-trained image classification model. These models are commonly known as encoders because their job is to encode an image into a feature vector. I analyzed four encoders: 1) MobileNet, 2) EfficientNet, 3) ResNet, and 4) BiT. After basic research, I decided to select the BiT model because of its performance and state-of-the-art nature. I selected the BiT-M R50x3 variant of the model, which is 748 MB in size. More details about this architecture can be found on the official page here.
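The BiT-M models are published on TensorFlow Hub. A minimal sketch of loading the feature-vector variant as a Keras layer; the hub URL below is my assumption for the R50x3 feature extractor, so swap it for the variant you actually use:

import tensorflow as tf
import tensorflow_hub as hub

# Assumed TF Hub handle for the BiT-M R50x3 feature-vector model
bit_module = hub.KerasLayer("https://tfhub.dev/google/bit/m-r50x3/1")

# Encode a batch of images (float values in [0, 1]) into fixed-length feature vectors
images = tf.random.uniform((2, 224, 224, 3))
features = bit_module(images)
print(features.shape)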

Vector Similarity System

Images are represented in a fixed-length feature vector format. For a given input vector, we need to find the TopK most similar vectors, keeping memory efficiency and real-time retrieval in mind. I explored the most popular techniques and listed five of them: Annoy, Cosine distance, L1 distance, Locality-Sensitive Hashing (LSH), and Image Deep Ranking. I selected Annoy because of its fast and efficient nature. More details about Annoy can be found on the official page here.
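As a minimal sketch of how Annoy is used (feature_vectors and query_vec are placeholders for the encoded catalog images and the encoded query image):

from annoy import AnnoyIndex

dim = 64  # dimensionality of the image feature vectors (illustrative)
index = AnnoyIndex(dim, 'angular')  # angular distance ~ cosine similarity

for i, vec in enumerate(feature_vectors):  # feature_vectors assumed from the encoder
    index.add_item(i, vec)

index.build(10)           # number of trees: more trees -> better recall, bigger index
index.save('images.ann')  # the index is memory-mapped, so lookups stay cheap

top_k_ids = index.get_nns_by_vector(query_vec, 10)  # TopK most similar images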

Dataset

I listed 3 datasets from Kaggle that best fit the criteria of this use case: 1) Fashion Product Images (Small), 2) Food-11 Image Dataset, and 3) Caltech 256 Image Dataset. I selected the Fashion dataset and the Food dataset.

Literature review

  • Determining Image similarity with Quasi-Euclidean Metric arxiv
  • CatSIM: A Categorical Image Similarity Metric arxiv
  • Central Similarity Quantization for Efficient Image and Video Retrieval arxiv
  • Improved Deep Hashing with Soft Pairwise Similarity for Multi-label Image Retrieval arxiv
  • Model-based Behavioral Cloning with Future Image Similarity Learning arxiv
  • Why do These Match? Explaining the Behavior of Image Similarity Models arxiv
  • Learning Non-Metric Visual Similarity for Image Retrieval arxiv

Process Flow

Step 1: Data Acquisition

Download the raw image dataset into a directory. Categorize these images into their respective category directories. Make sure that images are of the same type, JPEG recommended. We will also process the metadata and store it in a serialized file, CSV recommended.

Step 2: Encoder Fine-tuning

Download the pre-trained image model and add two additional layers on top of that: the first layer is a feature vector layer and the second layer is the classification layer. We will only train these 2 layers on our data and after training, we will select the feature vector layer as the output of our fine-tuned encoder. After fine-tuning the model, we will save the feature extractor for later use.

Fig: a screenshot of encoder fine-tuning process
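The post does not include the fine-tuning code itself. A minimal Keras-style sketch of the idea, with an assumed BiT hub handle and input size and a hypothetical num_classes, could look like this:

import tensorflow as tf
import tensorflow_hub as hub

num_classes = 10  # hypothetical number of categories in the dataset

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(224, 224, 3)),
    hub.KerasLayer("https://tfhub.dev/google/bit/m-r50x3/1", trainable=False),
    tf.keras.layers.Dense(256, activation='relu', name='feature_vector'),  # layer 1: feature vector
    tf.keras.layers.Dense(num_classes, activation='softmax'),              # layer 2: classifier
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.fit(train_ds, epochs=5)  # train only the two new layers

# After training, keep everything up to the feature-vector layer as the encoder
encoder = tf.keras.Model(model.inputs, model.get_layer('feature_vector').output)
encoder.save('encoder')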

Step 3: Image Vectorization

Now, we will use the encoder (prepared in step 2) to encode the images (prepared in step 1). We will save the feature vector of each image as an array in a directory. After processing, we will save these embeddings for later use.

Step 4: Metadata and Indexing

We will assign a unique id to each image and create dictionaries to locate the information for that image: 1) image id to image name, 2) image id to image feature vector, and 3) (optional) image id to product metadata. We will also build an index over the image feature vectors. Then we will save these dictionaries and the index object for later use.
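A small sketch of these lookups (image_paths and feature_vectors are placeholders for the outputs of steps 1 and 3; the Annoy index is built from the same vectors as shown earlier):

import json
import pickle

id_to_name = {i: p for i, p in enumerate(image_paths)}        # 1) image id -> image name
id_to_vector = {i: v for i, v in enumerate(feature_vectors)}  # 2) image id -> feature vector

# Persist the dictionaries next to the saved Annoy index
with open('id_to_name.json', 'w') as f:
    json.dump(id_to_name, f)
with open('id_to_vector.pkl', 'wb') as f:
    pickle.dump(id_to_vector, f)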

Step 5: API Call

We will receive an image from the user, encode it with our image encoder, find the TopK most similar vectors using the index object, and retrieve the images (and metadata) using the dictionaries. We then send these images (and metadata) back to the user.

Deployment

The API was deployed on AWS cloud infrastructure using AWS Elastic Beanstalk service.

/img/content-blog-raw-blog-image-similarity-system-untitled-2.png

· 4 min read
Sparsh Agarwal

We are going to discuss the following 4 use cases:

  1. Detect faces, eyes, pedestrians, cars, and number plates using OpenCV haar cascade classifiers
  2. Streamlit app for MobileNet SSD Caffe Pre-trained model
  3. Streamlit app for various object detection models and use cases
  4. Detect COCO-80 class objects in videos using TFHub MobileNet SSD model

Use Case 1 - Object detection with OpenCV

Face detection - We will use the frontal face Haar cascade classifier model to detect faces in the given image. The following function first passes the given image into the classifier model to detect a list of face bounding boxes and then runs a loop to draw a red rectangle box around each detected face in the image:

def detect_faces(fix_img):
    face_rects = face_classifier.detectMultiScale(fix_img)
    for (x, y, w, h) in face_rects:
        cv2.rectangle(fix_img,
                      (x, y),
                      (x+w, y+h),
                      (255, 0, 0),
                      10)
    return fix_img

Eyes detection - The process is almost the same as for face detection. Instead of the frontal-face Haar cascade, we will use the eye-detection Haar cascade model.
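A sketch that mirrors detect_faces, assuming the eye cascade bundled with OpenCV (located via cv2.data.haarcascades):

eye_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_eye.xml')

def detect_eyes(fix_img):
    eye_rects = eye_classifier.detectMultiScale(fix_img)
    for (x, y, w, h) in eye_rects:
        cv2.rectangle(fix_img, (x, y), (x+w, y+h), (0, 255, 0), 10)
    return fix_img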

Input image

detected faces and eyes in the image

Pedestrian detection - We will use the full-body Haar cascade classifier model for pedestrian detection. We will apply this model to a video this time. The following function will run the model on each frame of the video to detect the pedestrians:

# While Loop
while cap.isOpened():
    # Read the capture
    ret, frame = cap.read()
    # if Statement: stop when no frame is returned
    if ret == True:
        # Pass the Frame to the Classifier
        bodies = body_classifier.detectMultiScale(frame, 1.2, 3)
        # Bound Boxes to Identified Bodies
        for (x, y, w, h) in bodies:
            cv2.rectangle(frame,
                          (x, y),
                          (x+w, y+h),
                          (25, 125, 225),
                          5)
        cv2.imshow('Pedestrians', frame)
        # Exit with Esc button
        if cv2.waitKey(1) == 27:
            break
    # else Statement
    else:
        break

# Release the Capture & Destroy All Windows
cap.release()
cv2.destroyAllWindows()

Car detection - The process is almost the same as for pedestrian detection. Again, we will use this model on a video; instead of the full-body Haar cascade, we will use the car cascade model.

Car number plate detection - The process is almost the same as for face and eye detection. We will use the car number-plate cascade model.
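A sketch of the same pattern, assuming the number-plate cascade that ships with OpenCV:

plate_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_russian_plate_number.xml')

def detect_plates(fix_img):
    plate_rects = plate_classifier.detectMultiScale(fix_img, 1.2, 4)
    for (x, y, w, h) in plate_rects:
        cv2.rectangle(fix_img, (x, y), (x+w, y+h), (0, 0, 255), 5)
    return fix_img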

You can find the code here on Github.

Use Case 2 - MobileNet SSD Caffe Pre-trained model

You can play with the live app here. Source code is available here on Github.

Use Case 3 - YOLO Object Detection App

You can play with the live app here. Source code is available here on Github.

This app can detect COCO 80-classes using three different models - Caffe MobileNet SSD, Yolo3-tiny, and Yolo3. It can also detect faces using two different models - SSD Res10 and OpenCV face detector. Yolo3-tiny can also detect fires.

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled.png

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled-1.png

Use Case 4 - TFHub MobileNet SSD on Videos

In this section, we will use the MobileNet SSD object detection model from TFHub. We will apply it to videos. We can load the model using the following command:

module_handle = "https://tfhub.dev/google/openimages_v4/ssd/mobilenet_v2/1"
detector = hub.load(module_handle).signatures['default']

After loading the model, we will capture frames using the OpenCV video capture method and pass each frame through the detection model:

cap = cv2.VideoCapture('/content/Spectre_opening_highest_for_a_James_Bond_film_in_India.mp4')
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

for i in range(1, total_frames, 200):
    cap.set(cv2.CAP_PROP_POS_FRAMES, i)
    ret, frame = cap.read()
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    run_detector(detector, frame)
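The run_detector helper is not defined in the post. A minimal sketch of what it might look like for this TF Hub signature (the 0.3 score threshold is an assumption; the original presumably also draws the boxes on the frame):

import tensorflow as tf

def run_detector(detector, frame):
    # Convert the RGB frame to a float32 tensor in [0, 1] with a batch dimension
    img = tf.image.convert_image_dtype(frame, tf.float32)[tf.newaxis, ...]
    result = {key: value.numpy() for key, value in detector(img).items()}

    # Print the confident detections (class entities come back as bytes)
    for entity, score in zip(result["detection_class_entities"], result["detection_scores"]):
        if score >= 0.3:
            print(f"{entity.decode('utf-8')}: {score:.2f}")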

Here are some detected objects in frames:

/img/content-blog-raw-blog-object-detection-hands-on-exercises-untitled.png

/img/content-blog-raw-blog-object-detection-hands-on-exercises-untitled-1.png

/img/content-blog-raw-blog-object-detection-hands-on-exercises-untitled-2.png

You can find the code here on Github.


Congrats! In the next post of this series, we will cover 5 exciting use cases: 1) Detectron 2 object detection fine-tuning on a custom class, 2) TensorFlow Object Detection API inference, fine-tuning, and few-shot learning, 3) inference with 6 pre-trained models, 4) a Mask R-CNN object detection app, and 5) logo detection app deployment as a REST API using AWS Elastic Beanstalk.

· 2 min read
Sparsh Agarwal

Face detection

We will use the frontal face Haar cascade classifier model to detect faces in the given image. The following function first passes the given image into the classifier model to detect a list of face bounding boxes and then runs a loop to draw a red rectangle box around each detected face in the image:

def detect_faces(fix_img):
    face_rects = face_classifier.detectMultiScale(fix_img)
    for (x, y, w, h) in face_rects:
        cv2.rectangle(fix_img,
                      (x, y),
                      (x+w, y+h),
                      (255, 0, 0),
                      10)
    return fix_img

Eyes detection

The process is almost the same as for face detection. Instead of the frontal-face Haar cascade, we will use the eye-detection Haar cascade model.

Input image

detected faces and eyes in the image

Pedestrian detection

We will use the full-body Haar cascade classifier model for pedestrian detection. We will apply this model to a video this time. The following function will run the model on each frame of the video to detect the pedestrians:

# While Loop
while cap.isOpened():
    # Read the capture
    ret, frame = cap.read()
    # if Statement: stop when no frame is returned
    if ret == True:
        # Pass the Frame to the Classifier
        bodies = body_classifier.detectMultiScale(frame, 1.2, 3)
        # Bound Boxes to Identified Bodies
        for (x, y, w, h) in bodies:
            cv2.rectangle(frame,
                          (x, y),
                          (x+w, y+h),
                          (25, 125, 225),
                          5)
        cv2.imshow('Pedestrians', frame)
        # Exit with Esc button
        if cv2.waitKey(1) == 27:
            break
    # else Statement
    else:
        break

# Release the Capture & Destroy All Windows
cap.release()
cv2.destroyAllWindows()

Car detection

The process is almost the same as for pedestrian detection. Again, we will use this model on a video; instead of the full-body Haar cascade, we will use the car cascade model.

Car number plate detection

The process is almost the same as for face and eye detection. We will use the car number-plate cascade model.

You can find the code here on Github.

· 2 min read
Sparsh Agarwal

Live app

This app can detect COCO 80-classes using three different models - Caffe MobileNet SSD, Yolo3-tiny, and Yolo3. It can also detect faces using two different models - SSD Res10 and OpenCV face detector. Yolo3-tiny can also detect fires.

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled.png

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled-1.png

Code

import streamlit as st
import cv2
from PIL import Image
import numpy as np
import os

from tempfile import NamedTemporaryFile
from tensorflow.keras.preprocessing.image import img_to_array, load_img

temp_file = NamedTemporaryFile(delete=False)

DEFAULT_CONFIDENCE_THRESHOLD = 0.5
DEMO_IMAGE = "test_images/demo.jpg"
MODEL = "model/MobileNetSSD_deploy.caffemodel"
PROTOTXT = "model/MobileNetSSD_deploy.prototxt.txt"

CLASSES = [
"background",
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"pottedplant",
"sheep",
"sofa",
"train",
"tvmonitor",
]
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

@st.cache
def process_image(image):
    blob = cv2.dnn.blobFromImage(
        cv2.resize(image, (300, 300)), 0.007843, (300, 300), 127.5
    )
    net = cv2.dnn.readNetFromCaffe(PROTOTXT, MODEL)
    net.setInput(blob)
    detections = net.forward()
    return detections

@st.cache
def annotate_image(
    image, detections, confidence_threshold=DEFAULT_CONFIDENCE_THRESHOLD
):
    # loop over the detections
    (h, w) = image.shape[:2]
    labels = []
    for i in np.arange(0, detections.shape[2]):
        confidence = detections[0, 0, i, 2]

        if confidence > confidence_threshold:
            # extract the index of the class label from the `detections`,
            # then compute the (x, y)-coordinates of the bounding box for
            # the object
            idx = int(detections[0, 0, i, 1])
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")

            # display the prediction
            label = f"{CLASSES[idx]}: {round(confidence * 100, 2)}%"
            labels.append(label)
            cv2.rectangle(image, (startX, startY), (endX, endY), COLORS[idx], 2)
            y = startY - 15 if startY - 15 > 15 else startY + 15
            cv2.putText(
                image, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2
            )
    return image, labels

def main():
    selected_box = st.sidebar.selectbox(
        'Choose one of the following',
        ('Welcome', 'Object Detection')
    )

    if selected_box == 'Welcome':
        welcome()
    if selected_box == 'Object Detection':
        object_detection()

def welcome():
    st.title('Object Detection using Streamlit')
    st.subheader('A simple app for object detection')
    st.image('test_images/demo.jpg', use_column_width=True)

def object_detection():

    st.title("Object detection with MobileNet SSD")

    confidence_threshold = st.sidebar.slider(
        "Confidence threshold", 0.0, 1.0, DEFAULT_CONFIDENCE_THRESHOLD, 0.05)

    st.sidebar.multiselect("Select object classes to include",
                           options=CLASSES,
                           default=CLASSES
                           )

    img_file_buffer = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

    if img_file_buffer is not None:
        temp_file.write(img_file_buffer.getvalue())
        image = load_img(temp_file.name)
        image = img_to_array(image)
        image = image / 255.0

    else:
        demo_image = DEMO_IMAGE
        image = np.array(Image.open(demo_image))

    detections = process_image(image)
    image, labels = annotate_image(image, detections, confidence_threshold)

    st.image(
        image, caption=f"Processed image", use_column_width=True,
    )

    st.write(labels)

main()

You can play with the live app here. Source code is available here on Github.

· 2 min read
Sparsh Agarwal

/img/content-blog-raw-blog-ocr-experiments-untitled.png

1. Tesseract

Tesseract is an open-source text recognition engine that is available under the Apache 2.0 license and its development has been sponsored by Google since 2006.

Notebook on nbviewer
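Tesseract itself is a C++ engine; from Python it is usually driven through the pytesseract wrapper, with the tesseract binary installed separately. A minimal sketch (the file name sample.png is illustrative):

from PIL import Image
import pytesseract

# Requires the tesseract binary on the PATH (e.g. apt-get install tesseract-ocr)
text = pytesseract.image_to_string(Image.open('sample.png'), lang='eng')
print(text)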

2. EasyOCR

Ready-to-use OCR with 70+ supported languages, including Chinese, Japanese, Korean, and Thai. EasyOCR is built with Python and the PyTorch deep learning library; having a GPU can speed up the whole detection process. The detection part uses the CRAFT algorithm, and the recognition model is a CRNN composed of 3 main components: feature extraction (currently ResNet), sequence labelling (LSTM), and decoding (CTC). EasyOCR doesn't have many software dependencies and can be used directly through its API.

Notebook on nbviewer
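A minimal sketch of the EasyOCR API (sample.png is illustrative; set gpu=True if a CUDA device is available):

import easyocr

reader = easyocr.Reader(['en'], gpu=False)

# readtext returns a list of (bounding_box, text, confidence) tuples
for bbox, text, conf in reader.readtext('sample.png'):
    print(f'{text} ({conf:.2f})')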

3. KerasOCR

This is a slightly polished and packaged version of the Keras CRNN implementation and the published CRAFT text detection model. It provides a high-level API for training a text detection and OCR pipeline and out-of-the-box OCR models, and an end-to-end training pipeline to build new OCR models.

Notebook on nbviewer
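A minimal sketch of the keras-ocr pipeline (it downloads the pre-trained CRAFT detector and CRNN recognizer on first use; sample.png is illustrative):

import keras_ocr

pipeline = keras_ocr.pipeline.Pipeline()

images = [keras_ocr.tools.read('sample.png')]
# Each prediction group is a list of (word, box) pairs for one image
prediction_groups = pipeline.recognize(images)
for word, box in prediction_groups[0]:
    print(word)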

4. ArabicOCR

It is an OCR system for the Arabic language that converts images of typed text to machine-encoded text. It currently supports only letters (29 letters). ArabicOCR aims to solve a simpler problem of OCR with images that contain only Arabic characters (check the dataset link below to see a sample of the images).

Notebook on nbviewer

· 14 min read
Sparsh Agarwal

/img/content-blog-raw-blog-vehicle-suggestions-untitled.png

Introduction

The customer owns a franchise store for selling Tesla Automobiles. The objective is to predict user preferences using social media data.

Task 1 - Suggest the best vehicle for the given description

Task 2 - Suggest the best vehicle for the given social media id of the user

Customer queries

// car or truck or no mention of vehicle type means Cyber Truck
// SUV mention means Model X
const one = "I'm looking for a fast suv that I can go camping without worrying about recharging";
const two = "cheap red car that is able to go long distances";
const three = "i am looking for a daily driver that i can charge everyday, do not need any extras";
const four = "i like to go offroading a lot on my jeep and i want to do the same with the truck";
const five = "i want the most basic suv possible";
const six = "I want all of the addons";
// mentions of large family or many people means model x
const seven = "I have a big family and want to be able to take them around town and run errands without worrying about charging";
  • Expected output
    const oneJson = {
    vehicle: 'Model X',
    trim : 'adventure',
    exteriorColor: 'whiteExterior',
    wheels: "22Performance",
    tonneau: "powerTonneau",
    packages: "",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "extendedRange",
    software: "",
    }

    const twoJSON = {
    vehicle: 'Cyber Truck',
    trim : 'base',
    exteriorColor: 'whiteExterior',
    wheels: "21AllSeason",
    tonneau: "powerTonneau",
    packages: "",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "extendedRange",
    software: "",
    }

    const threeJSON = {
    vehicle: 'Cyber Truck',
    trim : 'base',
    exteriorColor: 'whiteExterior',
    wheels: "21AllSeason",
    tonneau: "powerTonneau",
    packages: "",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "standardRange",
    software: "",
    }

    const fourJSON = {
    vehicle: 'Cyber Truck',
    trim : 'adventure',
    exteriorColor: 'whiteExterior',
    wheels: "20AllTerrain",
    tonneau: "powerTonneau",
    packages: "offroadPackage,matchingSpareTire",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "extendedRange",
    software: "",
    }

    const fiveJSON = {
    vehicle: 'Model X',
    trim : 'base',
    exteriorColor: 'whiteExterior',
    wheels: "20AllTerrain",
    tonneau: "manualTonneau",
    packages: "",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "standardRange",
    software: "",
    }

    const sixJSON = {
    vehicle: 'Cyber Truck',
    trim : 'adventure',
    exteriorColor: 'whiteExterior',
    wheels: "20AllTerrain",
    tonneau: "powerTonneau",
    packages: "offroadPackage,matchingSpareTire",
    interiorAddons: "wirelessCharger",
    interiorColor: "blackInterior",
    range: "extendedRange",
    software: "selfDrivingPackage",
    }

    const sevenJSON = {
    vehicle: 'Model X',
    trim : 'base',
    exteriorColor: 'whiteExterior',
    wheels: "21AllSeason",
    tonneau: "powerTonneau",
    packages: "",
    interiorAddons: "",
    interiorColor: "blackInterior",
    range: "mediumRange",
    software: "",
    }
  • Vehicle model configurations
    const configuration = {
    meta: {
    configurationId: '???',
    storeId: 'US_SALES',
    country: 'US',
    version: '1.0',
    effectiveDate: '???',
    currency: 'USD',
    locale: 'en-US',
    availableLocales: ['en-US'],
    },

    defaults: {
    basePrice: 50000,
    deposit: 1000,
    initialSelection: [
    'adventure',
    'whiteExterior',
    '21AllSeason',
    'powerTonneau',
    'blackInterior',
    'mediumRange',
    ],
    },

    groups: {
    trim: {
    name: { 'en-US': 'Choose trim' },
    multiselect: false,
    required: true,
    options: ['base', 'adventure'],
    },
    exteriorColor: {
    name: { 'en-US': 'Choose paint' },
    multiselect: false,
    required: true,
    options: [
    'whiteExterior',
    'blueExterior',
    'silverExterior',
    'greyExterior',
    'blackExterior',
    'redExterior',
    'greenExterior',
    ],
    },
    wheels: {
    name: { 'en-US': 'Choose wheels' },
    multiselect: false,
    required: true,
    options: ['21AllSeason', '20AllTerrain', '22Performance'],
    },
    tonneau: {
    name: { 'en-US': 'Choose tonneau cover' },
    multiselect: false,
    required: true,
    options: ['manualTonneau', 'powerTonneau'],
    },
    packages: {
    name: { 'en-US': 'Choose upgrades' },
    multiselect: true,
    required: false,
    options: ['offroadPackage', 'matchingSpareTire'],
    },
    interiorColor: {
    name: { 'en-US': 'Choose interior' },
    multiselect: false,
    required: true,
    options: ['greyInterior', 'blackInterior', 'greenInterior'],
    },
    interiorAddons: {
    name: { 'en-US': 'Choose upgrade' },
    multiselect: true,
    required: false,
    options: ['wirelessCharger'],
    },
    range: {
    name: { 'en-US': 'Choose range' },
    multiselect: false,
    required: true,
    options: ['standardRange', 'mediumRange', 'extendedRange'],
    },
    software: {
    name: { 'en-US': 'Choose upgrade' },
    multiselect: true,
    required: false,
    options: ['selfDrivingPackage'],
    },
    specs: {
    name: { 'en-US': 'Specs overview *' },
    attrs: {
    description: {
    'en-US':
    "* Options, specs and pricing may change as we approach production. We'll contact you to review any updates to your preferred build.",
    },
    },
    multiselect: false,
    required: false,
    options: ['acceleration', 'power', 'towing', 'range'],
    },
    },

    options: {
    base: {
    name: { 'en-US': 'Base' },
    attrs: {
    description: { 'en-US': 'Production begins 2022' },
    },
    visual: true,
    price: 0,
    },
    adventure: {
    name: { 'en-US': 'Adventure' },
    attrs: {
    description: { 'en-US': 'Production begins 2021' },
    },
    visual: true,
    price: 10000,
    },

    standardRange: {
    name: { 'en-US': 'Standard' },
    attrs: {
    description: { 'en-US': '230+ miles' },
    },
    price: 0,
    },
    mediumRange: {
    name: { 'en-US': 'Medium' },
    attrs: {
    description: { 'en-US': '300+ miles' },
    },
    price: 3000,
    },
    extendedRange: {
    name: { 'en-US': 'Extended' },
    attrs: {
    description: { 'en-US': '400+ miles' },
    },
    price: 8000,
    },

    greenExterior: {
    name: { 'en-US': 'Adirondack Green' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/green.svg',
    },
    visual: true,
    price: 2000,
    },
    blueExterior: {
    name: { 'en-US': 'Trestles Blue' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/blue.svg',
    },
    visual: true,
    price: 1000,
    },
    whiteExterior: {
    name: { 'en-US': 'Arctic White' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/white.svg',
    },
    visual: true,
    price: 0,
    },
    silverExterior: {
    name: { 'en-US': 'Silver Gracier' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/silver.svg',
    },
    visual: true,
    price: 1000,
    },
    blackExterior: {
    name: { 'en-US': 'Cosmic Black' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/black.svg',
    },
    visual: true,
    price: 1000,
    },
    redExterior: {
    name: { 'en-US': 'Red Rocks' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/red.svg',
    },
    visual: true,
    price: 2000,
    },
    greyExterior: {
    name: { 'en-US': 'Antracite Grey' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/exteriorcolors/grey.svg',
    },
    visual: true,
    price: 1000,
    },

    '21AllSeason': {
    name: { 'en-US': '21" Cast Wheel - All Season' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/wheels/twentyone.svg',
    },
    visual: true,
    price: 0,
    },
    '20AllTerrain': {
    name: { 'en-US': '20" Forged Wheel - All Terrain' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/wheels/twenty.svg',
    },
    visual: true,
    price: 0,
    },
    '22Performance': {
    name: { 'en-US': '22" Cast Wheel - Performance' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/wheels/twentytwo.svg',
    },
    visual: true,
    price: 2000,
    },

    manualTonneau: {
    name: { 'en-US': 'Manual' },
    attrs: {
    description: { 'en-US': 'Description here' },
    },
    price: 0,
    },
    powerTonneau: {
    name: { 'en-US': 'Powered' },
    attrs: {
    description: { 'en-US': 'Description here' },
    },
    price: 0,
    },

    blackInterior: {
    name: { 'en-US': 'Black' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/interiorcolors/black.svg',
    },
    visual: true,
    price: 0,
    },
    greyInterior: {
    name: { 'en-US': 'Grey' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/interiorcolors/grey.svg',
    },
    visual: true,
    price: 1000,
    },
    greenInterior: {
    name: { 'en-US': 'Green' },
    attrs: {
    imageUrl: '/public/images/configurationOptions/interiorcolors/green.svg',
    },
    visual: true,
    price: 2000,
    },

    offroadPackage: {
    name: { 'en-US': 'Off-Road' },
    attrs: {
    description: { 'en-US': 'Lorem ipsum dolor sit amet.' },
    imageUrl: '/public/images/configurationOptions/packages/offroad.png',
    },
    visual: true,
    price: 5000,
    },
    matchingSpareTire: {
    name: { 'en-US': 'Matching Spare Tire' },
    attrs: {
    description: { 'en-US': 'Full sized tire' },
    imageUrl: '/public/images/configurationOptions/packages/spare.png',
    },
    price: 500,
    },

    wirelessCharger: {
    name: { 'en-US': 'Wireless charger' },
    attrs: {
    description: { 'en-US': 'Lorem ipsum dolor sit amet.' },
    imageUrl: '/public/images/configurationOptions/packages/wireless.png',
    },
    price: 100,
    },
    selfDrivingPackage: {
    name: { 'en-US': 'Autonomy' },
    attrs: {
    description: { 'en-US': 'Lorem ipsum dolor sit amet.' },
    imageUrl: '/public/images/configurationOptions/packages/autonomy.png',
    },
    price: 7000,
    },

    acceleration: {
    name: { 'en-US': '0 - 60 mph' },
    attrs: {
    units: { 'en-US': 'sec' },
    decimals: 1,
    },
    value: 3.4,
    },
    power: {
    name: { 'en-US': 'Horsepower' },
    attrs: {
    units: { 'en-US': 'hp' },
    },
    value: 750,
    },
    towing: {
    name: { 'en-US': 'Towing' },
    attrs: {
    units: { 'en-US': 'lbs' },
    },
    value: 10000,
    },
    range: {
    name: { 'en-US': 'Range' },
    attrs: {
    units: { 'en-US': 'mi' },
    },
    value: 400,
    },
    }
    };

Public datasets

  • Instagram: 16539 images from 972 Instagram influencers (link)
  • TechCrunchPosts: (link)
  • Tweets: (link)

Primary (available for academic use only, need university affiliation for access)

Secondary (low quality data, not sure if can be used at all)

Logical Reasoning

  • If I implicitly rate pictures of blue car, that means I might prefer a blue car.
  • If I like posts of self-driving, that means I might prefer a self-driving option.

Scope

Scope 1

/img/content-blog-raw-blog-vehicle-suggestions-untitled-2.png

Scope 2

media content categories: text and images

platforms: facebook, twitter and instagram

implicit rating categories: like, comment, share

columns: userid, timestamp, platform, type, content, rating

Model Framework

Model framework 1

  1. Convert user's natural language query into vector using Universal Sentence Embedding model
  2. Create a product specs binary matrix based on different categories
  3. Find TopK similar query vectors using cosine distance
  4. For each TopK vector, Find TopM product specs using interaction table weights
  5. For each TopM specification, find TopN similar specs using binary matrix
  6. Show all the qualified product specifications

Model framework 2

  1. Seed data: 10 users with ground-truth persona, media content and implicit ratings
  2. Inflated data: 10 users with media content and implicit ratings
  3. media content → Implicit rating (A)
  4. media content → feature vector (B) + (A) → weighted pooling → similar users (C)
  5. media content → QA model → slot filling → global pooling → item associations (D)
  6. (C) → content-based filtering → item recommendations → (D) → top-k recommendations

User selection

Model framework 3

User-User Similarity (clustering)

  • User → Media content → Embedding → Average pooling
  • Cosine Similarity of user's social vector with other user's social vector

User-Item Similarity (reranking)

  • User → Implicit Rating on media content M → M's correlation with item features
  • Item features: familySize
  • Cosine Similarity of user's social vector with item's feature vector

Model framework 4

/img/content-blog-raw-blog-vehicle-suggestions-untitled-3.png

Text → Prepare → Vectorize → Average → Similar Users

Image → Prepare → Vectorize → Average → Similar Users

Text → Prepare → QA → Slot filling

Image → Prepare → VQA → Slot filling

Image → Similar Image from users → Detailed enquiry

Model framework 5

  1. Topic Clusters Text
  2. Topic Clusters Image
  3. Fetch raw text and images
  4. Combine, Clean and Store text in text dataframe
  5. Vectorize Texts
  6. Cosine similarities of texts with topic clusters
  7. Vectorize Images
  8. Cosine similarities of images with topic clusters

Experimental Setup

  • Experiment 1
    import numpy as np
    import pandas as pd
    import tensorflow_hub as hub
    from itertools import product
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.metrics.pairwise import cosine_similarity

    vehicle = ['modelX', 'cyberTruck']
    trim = ['adventure', 'base']
    exteriorColor = ['whiteExterior', 'blueExterior', 'silverExterior', 'greyExterior', 'blackExterior', 'redExterior', 'greenExterior']
    wheels = ['20AllTerrain', '21AllSeason', '22Performance']
    tonneau = ['powerTonneau', 'manualTonneau']
    interiorColor = ['blackInterior', 'greyInterior', 'greenInterior']
    range = ['standardRange', 'mediumRange', 'extendedRange']
    packages = ['offroadPackage', 'matchingSpareTire', 'offroadPackage,matchingSpareTire', 'None']
    interiorAddons = ['wirelessCharger', 'None']
    software = ['selfDrivingPackage', 'None']

    specs_cols = ['vehicle', 'trim', 'exteriorColor', 'wheels', 'tonneau', 'interiorColor', 'range', 'packages', 'interiorAddons', 'software']
    specs = pd.DataFrame(list(product(vehicle, trim, exteriorColor, wheels, tonneau, interiorColor, range, packages, interiorAddons, software)),
    columns=specs_cols)

    enc = OneHotEncoder(handle_unknown='error', sparse=False)
    specs_frame = specs.copy()  # keep the raw (un-encoded) specs for inverse lookups later
    specs = pd.DataFrame(enc.fit_transform(specs))

    specs_ids = specs.index.tolist()

    query_list = ["I'm looking for a fast suv that I can go camping without worrying about recharging",
    "cheap red car that is able to go long distances",
    "i am looking for a daily driver that i can charge everyday, do not need any extras",
    "i like to go offroading a lot on my jeep and i want to do the same with the truck",
    "i want the most basic suv possible",
    "I want all of the addons",
    "I have a big family and want to be able to take them around town and run errands without worrying about charging"]

    queries = pd.DataFrame(query_list, columns=['query'])
    query_ids = queries.index.tolist()

    const_oneJSON = {
    'vehicle': 'modelX',
    'trim' : 'adventure',
    'exteriorColor': 'whiteExterior',
    'wheels': "22Performance",
    'tonneau': "powerTonneau",
    'packages': "None",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "extendedRange",
    'software': "None",
    }

    const_twoJSON = {
    'vehicle': 'cyberTruck',
    'trim' : 'base',
    'exteriorColor': 'whiteExterior',
    'wheels': "21AllSeason",
    'tonneau': "powerTonneau",
    'packages': "None",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "extendedRange",
    'software': "None",
    }

    const_threeJSON = {
    'vehicle': 'cyberTruck',
    'trim' : 'base',
    'exteriorColor': 'whiteExterior',
    'wheels': "21AllSeason",
    'tonneau': "powerTonneau",
    'packages': "None",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "standardRange",
    'software': "None",
    }

    const_fourJSON = {
    'vehicle': 'cyberTruck',
    'trim' : 'adventure',
    'exteriorColor': 'whiteExterior',
    'wheels': "20AllTerrain",
    'tonneau': "powerTonneau",
    'packages': "offroadPackage,matchingSpareTire",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "extendedRange",
    'software': "None",
    }

    const_fiveJSON = {
    'vehicle': 'modelX',
    'trim' : 'base',
    'exteriorColor': 'whiteExterior',
    'wheels': "20AllTerrain",
    'tonneau': "manualTonneau",
    'packages': "None",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "standardRange",
    'software': "None",
    }

    const_sixJSON = {
    'vehicle': 'cyberTruck',
    'trim' : 'adventure',
    'exteriorColor': 'whiteExterior',
    'wheels': "20AllTerrain",
    'tonneau': "powerTonneau",
    'packages': "offroadPackage,matchingSpareTire",
    'interiorAddons': "wirelessCharger",
    'interiorColor': "blackInterior",
    'range': "extendedRange",
    'software': "selfDrivingPackage",
    }

    const_sevenJSON = {
    'vehicle': 'modelX',
    'trim' : 'base',
    'exteriorColor': 'whiteExterior',
    'wheels': "21AllSeason",
    'tonneau': "powerTonneau",
    'packages': "None",
    'interiorAddons': "None",
    'interiorColor': "blackInterior",
    'range': "mediumRange",
    'software': "None",
    }

    historical_data = pd.DataFrame([const_oneJSON, const_twoJSON, const_threeJSON, const_fourJSON, const_fiveJSON, const_sixJSON, const_sevenJSON])

    input_vec = enc.transform([specs_frame.append(historical_data.iloc[0], sort=False).iloc[-1]])
    idx = np.argsort(-cosine_similarity(input_vec, specs.values))[0,:][:1]
    rslt = enc.inverse_transform([specs.iloc[idx]])

    interactions = pd.DataFrame(columns=['query_id','specs_id'])
    interactions['query_id'] = queries.index.tolist()
    input_vecs = enc.transform(specs_frame.append(historical_data, sort=False).iloc[-len(historical_data):])
    interactions['specs_id'] = np.argsort(-cosine_similarity(input_vecs, specs.values))[:,0]

    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed_model = hub.load(module_url)
    def embed(input):
        return embed_model(input)
    query_vecs = embed(queries['query'].tolist()).numpy()

    _query = input('Please enter query: ') or 'i want the most basic suv possible'
    _query_vec = embed([_query]).numpy()
    _match_qid = np.argsort(-cosine_similarity(_query_vec, query_vecs))[0,:][:1]
    _match_sid = interactions.loc[interactions['query_id']==_match_qid[0], 'specs_id'].values[0]
    input_vec = enc.transform([specs_frame.append(historical_data.iloc[0], sort=False).iloc[-1]])
    idx = np.argsort(-cosine_similarity([specs.iloc[_match_sid].values], specs.values))[0,:][:5]
    results = []
    for x in idx:
        results.append(enc.inverse_transform([specs.iloc[x]]))
    _temp = np.array(results).reshape(5,-1)
    _temp = pd.DataFrame(_temp, columns=specs_frame.columns)
    print(_temp)

Experiment 2

Celeb Scraping

Facebook Scraping

/img/content-blog-raw-blog-vehicle-suggestions-untitled-4.png

Twitter Scraping

/img/content-blog-raw-blog-vehicle-suggestions-untitled-5.png

Dataframe

/img/content-blog-raw-blog-vehicle-suggestions-untitled-6.png

Insta Image Grid

/img/content-blog-raw-blog-vehicle-suggestions-untitled-7.png

User Text NER

/img/content-blog-raw-blog-vehicle-suggestions-untitled-8.png

Experiment 3

Topic model

Topic scores

/img/content-blog-raw-blog-vehicle-suggestions-untitled-9.png

JSON rules

/img/content-blog-raw-blog-vehicle-suggestions-untitled-10.png

Results and Discussion

  • API with 3 input fields - Facebook username, Twitter handle & Instagram username
  • The system will automatically scrape the user's publicly available text and images from these 3 social media platforms and provide a list of product recommendations ordered from most to least preferred

· 2 min read
Sparsh Agarwal

Live app

This app can detect COCO 80-classes using three different models - Caffe MobileNet SSD, Yolo3-tiny, and Yolo3. It can also detect faces using two different models - SSD Res10 and OpenCV face detector. Yolo3-tiny can also detect fires.

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled.png

/img/content-blog-raw-blog-object-detection-with-yolo3-untitled-1.png

Code

import streamlit as st
import cv2
from PIL import Image
import numpy as np
import os

from tempfile import NamedTemporaryFile
from tensorflow.keras.preprocessing.image import img_to_array, load_img

temp_file = NamedTemporaryFile(delete=False)

DEFAULT_CONFIDENCE_THRESHOLD = 0.5
DEMO_IMAGE = "test_images/demo.jpg"
MODEL = "model/MobileNetSSD_deploy.caffemodel"
PROTOTXT = "model/MobileNetSSD_deploy.prototxt.txt"

CLASSES = [
"background",
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"pottedplant",
"sheep",
"sofa",
"train",
"tvmonitor",
]
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

@st.cache
def process_image(image):
    blob = cv2.dnn.blobFromImage(
        cv2.resize(image, (300, 300)), 0.007843, (300, 300), 127.5
    )
    net = cv2.dnn.readNetFromCaffe(PROTOTXT, MODEL)
    net.setInput(blob)
    detections = net.forward()
    return detections

@st.cache
def annotate_image(
    image, detections, confidence_threshold=DEFAULT_CONFIDENCE_THRESHOLD
):
    # loop over the detections
    (h, w) = image.shape[:2]
    labels = []
    for i in np.arange(0, detections.shape[2]):
        confidence = detections[0, 0, i, 2]

        if confidence > confidence_threshold:
            # extract the index of the class label from the `detections`,
            # then compute the (x, y)-coordinates of the bounding box for
            # the object
            idx = int(detections[0, 0, i, 1])
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")

            # display the prediction
            label = f"{CLASSES[idx]}: {round(confidence * 100, 2)}%"
            labels.append(label)
            cv2.rectangle(image, (startX, startY), (endX, endY), COLORS[idx], 2)
            y = startY - 15 if startY - 15 > 15 else startY + 15
            cv2.putText(
                image, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2
            )
    return image, labels

def main():
    selected_box = st.sidebar.selectbox(
        'Choose one of the following',
        ('Welcome', 'Object Detection')
    )

    if selected_box == 'Welcome':
        welcome()
    if selected_box == 'Object Detection':
        object_detection()

def welcome():
    st.title('Object Detection using Streamlit')
    st.subheader('A simple app for object detection')
    st.image('test_images/demo.jpg', use_column_width=True)

def object_detection():

    st.title("Object detection with MobileNet SSD")

    confidence_threshold = st.sidebar.slider(
        "Confidence threshold", 0.0, 1.0, DEFAULT_CONFIDENCE_THRESHOLD, 0.05)

    st.sidebar.multiselect("Select object classes to include",
                           options=CLASSES,
                           default=CLASSES
                           )

    img_file_buffer = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

    if img_file_buffer is not None:
        temp_file.write(img_file_buffer.getvalue())
        image = load_img(temp_file.name)
        image = img_to_array(image)
        image = image / 255.0

    else:
        demo_image = DEMO_IMAGE
        image = np.array(Image.open(demo_image))

    detections = process_image(image)
    image, labels = annotate_image(image, detections, confidence_threshold)

    st.image(
        image, caption=f"Processed image", use_column_width=True,
    )

    st.write(labels)

main()

You can play with the live app here. Source code is available here on Github.

· One min read
Sparsh Agarwal

You can play with the live app here. Source code is available here on Github.

Live app

/img/content-blog-raw-mobilenet-ssd-caffe-pre-trained-model-untitled.png

Code

#------------------------------------------------------#
# Import libraries
#------------------------------------------------------#

import datetime
import urllib
import time
import cv2 as cv
import streamlit as st

from plugins import Motion_Detection
from utils import GUI, AppManager, DataManager

#------------------------------------------------------#
#------------------------------------------------------#

def imageWebApp(guiParam):
    """Run the selected image application on a single image."""
    # Load the image according to the selected option
    conf = DataManager(guiParam)
    image = conf.load_image_or_video()

    # GUI
    switchProcessing = st.button('* Start Processing *')

    # Apply the selected plugin on the image
    bboxed_frame, output = AppManager(guiParam).process(image, True)

    # Display results
    st.image(bboxed_frame, channels="BGR", use_column_width=True)

def main():
    """Entry point of the Streamlit app."""
    # Get the parameters entered by the user from the GUI
    guiParam = GUI().getGuiParameters()

    # Check whether an application has been selected
    if guiParam['appType'] == 'Image Applications':
        if guiParam["selectedApp"] != 'Empty':
            imageWebApp(guiParam)

    else:
        raise st.ScriptRunner.StopException

#------------------------------------------------------#
#------------------------------------------------------#

if __name__ == "__main__":
    main()