computer-vision

Compare original and translation side by side

🇺🇸

Original

English
🇨🇳

Translation

Chinese

Computer Vision

计算机视觉

Build models to analyze and understand visual data.
构建用于分析和理解视觉数据的模型。

Quick Start

快速入门

Image Classification

图像分类

python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

Load pre-trained model

Load pre-trained model

# Load a pre-trained ResNet-50 and switch to inference mode.
# NOTE(review): pretrained=True is deprecated in recent torchvision;
# prefer weights=models.ResNet50_Weights.DEFAULT when upgrading.
model = models.resnet50(pretrained=True)
model.eval()  # disables dropout / freezes batch-norm statistics
model = models.resnet50(pretrained=True)
model.eval()

Preprocess image

Preprocess image

# Standard ImageNet evaluation preprocessing: resize, centre-crop,
# convert to tensor, normalise with ImageNet channel statistics.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
img = Image.open('image.jpg')
img_tensor = transform(img).unsqueeze(0)  # add batch dimension -> (1, 3, 224, 224)
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
img = Image.open('image.jpg')
img_tensor = transform(img).unsqueeze(0)

Predict

Predict

# Run inference without gradient tracking, convert logits to
# probabilities, and keep the five most likely classes.
with torch.no_grad():
    output = model(img_tensor)
    probabilities = torch.nn.functional.softmax(output[0], dim=0)
    top5 = torch.topk(probabilities, 5)
print(top5)
with torch.no_grad():
    output = model(img_tensor)
    probabilities = torch.nn.functional.softmax(output[0], dim=0)
    top5 = torch.topk(probabilities, 5)
print(top5)

Custom CNN

自定义CNN

python
import torch.nn as nn

class SimpleCNN(nn.Module):
    """A small three-stage convolutional classifier.

    Each stage is Conv2d -> ReLU -> 2x2 MaxPool, so spatial resolution is
    halved three times; the flattened size 128 * 4 * 4 matches 32x32 inputs
    (32 -> 16 -> 8 -> 4).
    """

    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        # Feature extractor: channels grow 3 -> 32 -> 64 -> 128.
        stages = []
        in_channels = 3
        for out_channels in (32, 64, 128):
            stages.append(nn.Conv2d(in_channels, out_channels,
                                    kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2, 2))
            in_channels = out_channels
        self.features = nn.Sequential(*stages)
        # Classifier head: flatten, one hidden layer with dropout, logits.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return raw logits of shape (batch, num_classes)."""
        return self.classifier(self.features(x))
python
import torch.nn as nn

class SimpleCNN(nn.Module):
    """A small three-stage convolutional classifier.

    Each stage halves the spatial resolution via 2x2 max-pooling, so the
    flattened size 128 * 4 * 4 matches 32x32 inputs (32 -> 16 -> 8 -> 4).
    """

    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        # Feature extractor: channels 3 -> 32 -> 64 -> 128, each conv
        # followed by ReLU and a 2x2 max-pool.
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # Classifier head: flatten, hidden layer with dropout, then logits.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        # Returns raw logits of shape (batch, num_classes).
        x = self.features(x)
        x = self.classifier(x)
        return x

Data Augmentation

数据增强

python
from torchvision import transforms

# Training-time augmentation pipeline (applied to PIL images): random
# geometric and colour augmentations, then tensor conversion and
# normalisation with ImageNet channel statistics.
_augment = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1
    ),
]
_to_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
]
train_transform = transforms.Compose(_augment + _to_tensor)
python
from torchvision import transforms

# Training-time augmentation pipeline (applied to PIL images).
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),   # random crop resized to 224x224
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),       # up to +/- 15 degrees
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1
    ),
    transforms.ToTensor(),
    transforms.Normalize(                # ImageNet channel statistics
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

Object Detection with YOLO

基于YOLO的目标检测

python
from ultralytics import YOLO
python
from ultralytics import YOLO

Load model

Load model

model = YOLO('yolov8n.pt')
model = YOLO('yolov8n.pt')

Predict

Predict

results = model('image.jpg')
results = model('image.jpg')

Process results

Process results

# Iterate YOLO detections: each box carries xyxy coordinates, a
# confidence score, and a class id.
for result in results:
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        confidence = box.conf[0]
        class_id = box.cls[0]
        print(f"Class: {class_id}, Confidence: {confidence:.2f}")
        print(f"Box: ({x1}, {y1}, {x2}, {y2})")
for result in results:
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        confidence = box.conf[0]
        class_id = box.cls[0]
        print(f"Class: {class_id}, Confidence: {confidence:.2f}")
        print(f"Box: ({x1}, {y1}, {x2}, {y2})")

Save results

Save results

results[0].save('output.jpg')
results[0].save('output.jpg')

Image Segmentation

图像分割

python
python

Semantic segmentation with DeepLab

Semantic segmentation with DeepLab

# Load a DeepLabV3 (ResNet-50 backbone) semantic-segmentation model
# from torch.hub and switch to inference mode.
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'deeplabv3_resnet50',
    pretrained=True
)
model.eval()
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'deeplabv3_resnet50',
    pretrained=True
)
model.eval()

Preprocess

Preprocess

# Segmentation preprocessing: tensor conversion plus ImageNet
# normalisation (no resize here -- DeepLab accepts variable sizes).
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
input_tensor = preprocess(img).unsqueeze(0)  # add batch dimension
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
input_tensor = preprocess(img).unsqueeze(0)

Predict

Predict

# Run segmentation inference; 'out' holds per-class score maps, and
# argmax over the class dimension gives a per-pixel class index map.
with torch.no_grad():
    output = model(input_tensor)['out'][0]
    output_predictions = output.argmax(0)
with torch.no_grad():
    output = model(input_tensor)['out'][0]
    output_predictions = output.argmax(0)

Transfer Learning

迁移学习

python
from torchvision import models
python
from torchvision import models

Load pre-trained ResNet

Load pre-trained ResNet

model = models.resnet50(pretrained=True)
model = models.resnet50(pretrained=True)

Freeze all layers

Freeze all layers

# Freeze every backbone parameter so only layers added afterwards train.
for param in model.parameters():
    param.requires_grad = False
for param in model.parameters():
    param.requires_grad = False

Replace final layer

Replace final layer

# Swap the final fully-connected layer for one sized to our classes;
# the new layer's parameters are trainable by default.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

Train only final layer

Train only final layer

optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

Image Processing with OpenCV

基于OpenCV的图像处理

python
import cv2
python
import cv2

Read image

Read image

img = cv2.imread('image.jpg')
img = cv2.imread('image.jpg')

Convert to grayscale

Convert to grayscale

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

Edge detection

Edge detection

edges = cv2.Canny(gray, 100, 200)
edges = cv2.Canny(gray, 100, 200)

Blur

Blur

blurred = cv2.GaussianBlur(img, (5, 5), 0)
blurred = cv2.GaussianBlur(img, (5, 5), 0)

Resize

Resize

resized = cv2.resize(img, (224, 224))
resized = cv2.resize(img, (224, 224))

Draw rectangle

Draw rectangle

cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

Save

Save

cv2.imwrite('output.jpg', img)
cv2.imwrite('output.jpg', img)

Face Detection

人脸检测

python
python

Haar Cascade

Haar Cascade

# Haar-cascade face detection: detect on grayscale, draw blue boxes.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# scaleFactor=1.1, minNeighbors=4
faces = face_cascade.detectMultiScale(gray, 1.1, 4)
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, 1.1, 4)
for (x, y, w, h) in faces:
    cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)

Common Architectures

常见网络架构

Image Classification:
  • ResNet: Skip connections, deep networks
  • EfficientNet: Compound scaling, efficient
  • Vision Transformer (ViT): Attention-based
Object Detection:
  • YOLO: Real-time, one-stage
  • Faster R-CNN: Two-stage, accurate
  • RetinaNet: Focal loss, handles class imbalance
Segmentation:
  • U-Net: Encoder-decoder, medical imaging
  • DeepLab: Atrous convolution, semantic segmentation
  • Mask R-CNN: Instance segmentation
图像分类:
  • ResNet: 残差连接,深度网络
  • EfficientNet: 复合缩放,高效
  • Vision Transformer (ViT): 基于注意力机制
目标检测:
  • YOLO: 实时检测,单阶段
  • Faster R-CNN: 双阶段,高精度
  • RetinaNet: 焦点损失,处理类别不平衡
图像分割:
  • U-Net: 编码器-解码器结构,医学影像
  • DeepLab: 空洞卷积,语义分割
  • Mask R-CNN: 实例分割

Tips

小贴士

  1. Use pre-trained models for transfer learning
  2. Apply data augmentation to prevent overfitting
  3. Normalize images (ImageNet statistics)
  4. Use appropriate loss functions (CrossEntropy, Focal Loss)
  5. Monitor training with visualization
  6. Test on diverse images
  1. 使用预训练模型进行迁移学习
  2. 应用数据增强防止过拟合
  3. 对图像进行归一化(采用ImageNet统计数据)
  4. 使用合适的损失函数(交叉熵损失、焦点损失)
  5. 通过可视化监控训练过程
  6. 在多样化图像上进行测试