computer-vision
Compare original and translation side by side
🇺🇸 Original (English)
🇨🇳 Translation (Chinese)
Computer Vision
计算机视觉
Build models to analyze and understand visual data.
构建用于分析和理解视觉数据的模型。
Quick Start
快速入门
Image Classification
图像分类
python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
Load pre-trained model
Load pre-trained model
model = models.resnet50(pretrained=True)
model.eval()
model = models.resnet50(pretrained=True)
model.eval()
Preprocess image
Preprocess image
transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
img = Image.open('image.jpg')
img_tensor = transform(img).unsqueeze(0)
transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
img = Image.open('image.jpg')
img_tensor = transform(img).unsqueeze(0)
Predict
Predict
with torch.no_grad():
output = model(img_tensor)
probabilities = torch.nn.functional.softmax(output[0], dim=0)
top5 = torch.topk(probabilities, 5)
print(top5)
with torch.no_grad():
output = model(img_tensor)
probabilities = torch.nn.functional.softmax(output[0], dim=0)
top5 = torch.topk(probabilities, 5)
print(top5)
Custom CNN
自定义CNN
python
import torch.nn as nn
class SimpleCNN(nn.Module):
def __init__(self, num_classes=10):
super(SimpleCNN, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2),
nn.Conv2d(32, 64, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2)
)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(128 * 4 * 4, 512),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(512, num_classes)
)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
return x
python
import torch.nn as nn
class SimpleCNN(nn.Module):
def __init__(self, num_classes=10):
super(SimpleCNN, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2),
nn.Conv2d(32, 64, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2, 2)
)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(128 * 4 * 4, 512),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(512, num_classes)
)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
return x
Data Augmentation
数据增强
python
from torchvision import transforms
train_transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(15),
transforms.ColorJitter(
brightness=0.2,
contrast=0.2,
saturation=0.2,
hue=0.1
),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
python
from torchvision import transforms
train_transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(15),
transforms.ColorJitter(
brightness=0.2,
contrast=0.2,
saturation=0.2,
hue=0.1
),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
Object Detection with YOLO
基于YOLO的目标检测
python
from ultralytics import YOLO
python
from ultralytics import YOLO
Load model
Load model
model = YOLO('yolov8n.pt')
model = YOLO('yolov8n.pt')
Predict
Predict
results = model('image.jpg')
results = model('image.jpg')
Process results
Process results
for result in results:
boxes = result.boxes
for box in boxes:
x1, y1, x2, y2 = box.xyxy[0]
confidence = box.conf[0]
class_id = box.cls[0]
print(f"Class: {class_id}, Confidence: {confidence:.2f}")
print(f"Box: ({x1}, {y1}, {x2}, {y2})")
for result in results:
boxes = result.boxes
for box in boxes:
x1, y1, x2, y2 = box.xyxy[0]
confidence = box.conf[0]
class_id = box.cls[0]
print(f"Class: {class_id}, Confidence: {confidence:.2f}")
print(f"Box: ({x1}, {y1}, {x2}, {y2})")
Save results
Save results
results[0].save('output.jpg')
results[0].save('output.jpg')
Image Segmentation
图像分割
python
python
Semantic segmentation with DeepLab
Semantic segmentation with DeepLab
model = torch.hub.load(
'pytorch/vision:v0.10.0',
'deeplabv3_resnet50',
pretrained=True
)
model.eval()
model = torch.hub.load(
'pytorch/vision:v0.10.0',
'deeplabv3_resnet50',
pretrained=True
)
model.eval()
Preprocess
Preprocess
preprocess = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
input_tensor = preprocess(img).unsqueeze(0)
preprocess = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
])
input_tensor = preprocess(img).unsqueeze(0)
Predict
Predict
with torch.no_grad():
output = model(input_tensor)['out'][0]
output_predictions = output.argmax(0)
with torch.no_grad():
output = model(input_tensor)['out'][0]
output_predictions = output.argmax(0)
Transfer Learning
迁移学习
python
from torchvision import models
python
from torchvision import models
Load pre-trained ResNet
Load pre-trained ResNet
model = models.resnet50(pretrained=True)
model = models.resnet50(pretrained=True)
Freeze all layers
Freeze all layers
for param in model.parameters():
param.requires_grad = False
for param in model.parameters():
param.requires_grad = False
Replace final layer
Replace final layer
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
Train only final layer
Train only final layer
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
Image Processing with OpenCV
基于OpenCV的图像处理
python
import cv2
python
import cv2
Read image
Read image
img = cv2.imread('image.jpg')
img = cv2.imread('image.jpg')
Convert to grayscale
Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
Edge detection
Edge detection
edges = cv2.Canny(gray, 100, 200)
edges = cv2.Canny(gray, 100, 200)
Blur
Blur
blurred = cv2.GaussianBlur(img, (5, 5), 0)
blurred = cv2.GaussianBlur(img, (5, 5), 0)
Resize
Resize
resized = cv2.resize(img, (224, 224))
resized = cv2.resize(img, (224, 224))
Draw rectangle
Draw rectangle
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
Save
Save
cv2.imwrite('output.jpg', img)
cv2.imwrite('output.jpg', img)
Face Detection
人脸检测
python
python
Haar Cascade
Haar Cascade
face_cascade = cv2.CascadeClassifier(
cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, 1.1, 4)
for (x, y, w, h) in faces:
cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
face_cascade = cv2.CascadeClassifier(
cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray, 1.1, 4)
for (x, y, w, h) in faces:
cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
Common Architectures
常见网络架构
Image Classification:
- ResNet: Skip connections, deep networks
- EfficientNet: Compound scaling, efficient
- Vision Transformer (ViT): Attention-based
Object Detection:
- YOLO: Real-time, one-stage
- Faster R-CNN: Two-stage, accurate
- RetinaNet: Focal loss, handles class imbalance
Segmentation:
- U-Net: Encoder-decoder, medical imaging
- DeepLab: Atrous convolution, semantic segmentation
- Mask R-CNN: Instance segmentation
图像分类:
- ResNet: 残差连接,深度网络
- EfficientNet: 复合缩放,高效
- Vision Transformer (ViT): 基于注意力机制
目标检测:
- YOLO: 实时检测,单阶段
- Faster R-CNN: 双阶段,高精度
- RetinaNet: 焦点损失,处理类别不平衡
图像分割:
- U-Net: 编码器-解码器结构,医学影像
- DeepLab: 空洞卷积,语义分割
- Mask R-CNN: 实例分割
Tips
小贴士
- Use pre-trained models for transfer learning
- Apply data augmentation to prevent overfitting
- Normalize images (ImageNet statistics)
- Use appropriate loss functions (CrossEntropy, Focal Loss)
- Monitor training with visualization
- Test on diverse images
- 使用预训练模型进行迁移学习
- 应用数据增强防止过拟合
- 对图像进行归一化(采用ImageNet统计数据)
- 使用合适的损失函数(交叉熵损失、焦点损失)
- 通过可视化监控训练过程
- 在多样化图像上进行测试