Problem: Photo of document from angle
Solution: Perspective transform to frontal view
Steps: detect the document corners → order them (TL, TR, BR, BL) → compute the homography → warp
Result: Rectangular, readable document
# Detect the four corners of a document photographed at an angle.
# Assumes `img` is the input BGR photo, loaded elsewhere (e.g. cv2.imread).
import cv2
import numpy as np

# Grayscale + fixed threshold so the (bright) document separates from
# the background as a single binary region.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

# External contours only; the document should be the largest one.
contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
largest = max(contours, key=cv2.contourArea)

# Approximate the contour by a polygon; epsilon at 2% of the perimeter
# is the usual tolerance for collapsing a noisy outline to its corners.
epsilon = 0.02 * cv2.arcLength(largest, True)
approx = cv2.approxPolyDP(largest, epsilon, True)
# If len(approx) == 4, the polygon is a quadrilateral -> the document.
def order_points(pts):
    """Order 4 corner points as top-left, top-right, bottom-right, bottom-left.

    Uses two scalar signatures of each (x, y) point:
    - the sum x + y is smallest at the top-left, largest at the bottom-right;
    - the difference y - x is smallest at the top-right, largest at the
      bottom-left.

    Parameters:
        pts: array of shape (4, 2) with corner coordinates in any order.

    Returns:
        float32 array of shape (4, 2) ordered TL, TR, BR, BL.
    """
    rect = np.zeros((4, 2), dtype='float32')

    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]   # top-left
    rect[2] = pts[np.argmax(s)]   # bottom-right

    diff = np.diff(pts, axis=1)   # y - x per row
    rect[1] = pts[np.argmin(diff)]  # top-right
    rect[3] = pts[np.argmax(diff)]  # bottom-left
    return rect


# usage: ordered_pts = order_points(approx.reshape(4, 2))
# Compute the output rectangle size from the ordered corner points
# (produced by order_points: TL, TR, BR, BL).
(tl, tr, br, bl) = ordered_pts

# Output width: the larger of the bottom and top edge lengths.
widthA = np.sqrt((br[0] - bl[0]) ** 2 + (br[1] - bl[1]) ** 2)
widthB = np.sqrt((tr[0] - tl[0]) ** 2 + (tr[1] - tl[1]) ** 2)
maxWidth = int(max(widthA, widthB))

# Output height: the larger of the right and left edge lengths.
heightA = np.sqrt((tr[0] - br[0]) ** 2 + (tr[1] - br[1]) ** 2)
heightB = np.sqrt((tl[0] - bl[0]) ** 2 + (tl[1] - bl[1]) ** 2)
maxHeight = int(max(heightA, heightB))
# Map the detected document corners onto an axis-aligned rectangle.
src = ordered_pts  # detected corners, ordered TL, TR, BR, BL

# Destination corners in the same TL, TR, BR, BL order; -1 because pixel
# indices run 0 .. size-1.
dst = np.array([[0, 0],
                [maxWidth - 1, 0],
                [maxWidth - 1, maxHeight - 1],
                [0, maxHeight - 1]], dtype='float32')

# Homography from the 4 point pairs, then warp to the frontal view.
M = cv2.getPerspectiveTransform(src, dst)
warped = cv2.warpPerspective(img, M, (maxWidth, maxHeight))
Purpose: Top-down view of scene
Use case: Lane detection, parking assistance, sports analysis
Method: Same as document scan
# Bird's-eye (top-down) view of the road region ahead of the camera.
h, w = img.shape[:2]

# Source: a trapezoid over the road — narrow near the horizon (60% down
# the frame), full-width at the bottom. Fractions are scene-dependent
# and typically tuned per camera mounting.
src = np.float32([[w * 0.45, h * 0.6],
                  [w * 0.55, h * 0.6],
                  [w * 0.9, h],
                  [w * 0.1, h]])

# Destination: the full output rectangle.
dst = np.float32([[0, 0],
                  [w, 0],
                  [w, h],
                  [0, h]])

M = cv2.getPerspectiveTransform(src, dst)
birds_eye = cv2.warpPerspective(img, M, (w, h))
Purpose: Map bird's eye view back to original perspective
Method: Use inverse of homography matrix
# Forward transform: original view -> bird's-eye view.
M = cv2.getPerspectiveTransform(src, dst)
birds_eye = cv2.warpPerspective(img, M, (w, h))

# Inverse transform: bird's-eye view -> original perspective.
# Swapping the point sets yields the inverse homography directly.
M_inv = cv2.getPerspectiveTransform(dst, src)
original = cv2.warpPerspective(birds_eye, M_inv, (w, h))

# Equivalently, invert the 3x3 homography matrix itself:
M_inv = np.linalg.inv(M)
# Q: What is a perspective transform used for?
# A: Document scanning, bird's-eye view, image rectification.
# Q: How many points are needed?
# A: 4 point pairs (source and destination).
# Q: What is a homography?
# A: A 3×3 perspective transformation matrix.
# Q: Why order the corner points?
# A: To ensure the correct TL, TR, BR, BL correspondence between source and destination.
# Q: What is a bird's-eye view?
# A: A top-down perspective of the scene.
# Q: How is the output rectangle size computed?
# A: Measure the edge lengths of the source quadrilateral and take the max width/height.
# Q: What is an inverse perspective transform?
# A: Transforming back from the warped view to the original view.
# Q: How is the inverse homography obtained?
# A: cv2.getPerspectiveTransform(dst, src), or np.linalg.inv(M).
# Q: What are the document-scan steps?
# A: Detect corners, order the points, compute the homography, warp.
# Minimal perspective-transform example: map 4 explicit source points
# onto the corners of a 300x300 output image.
pts1 = np.float32([[56, 65], [368, 52], [28, 387], [389, 390]])
pts2 = np.float32([[0, 0], [300, 0], [0, 300], [300, 300]])

M = cv2.getPerspectiveTransform(pts1, pts2)
result = cv2.warpPerspective(img, M, (300, 300))
def order_points(pts):
    """Sort 4 corner points into TL, TR, BR, BL order.

    x + y is minimal at the top-left and maximal at the bottom-right;
    y - x is minimal at the top-right and maximal at the bottom-left.

    Parameters:
        pts: (4, 2) array of corner coordinates in arbitrary order.

    Returns:
        (4, 2) float32 array ordered TL, TR, BR, BL.
    """
    rect = np.zeros((4, 2), dtype='float32')

    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]     # TL: smallest x + y
    rect[2] = pts[np.argmax(s)]     # BR: largest x + y

    diff = np.diff(pts, axis=1)     # y - x per row
    rect[1] = pts[np.argmin(diff)]  # TR: smallest y - x
    rect[3] = pts[np.argmax(diff)]  # BL: largest y - x
    return rect
# Bird's-eye view transform and its inverse, end to end.
h, w = img.shape[:2]

# Road trapezoid (source) -> full output rectangle (destination).
src = np.float32([[w * 0.45, h * 0.6], [w * 0.55, h * 0.6],
                  [w * 0.9, h], [w * 0.1, h]])
dst = np.float32([[0, 0], [w, 0], [w, h], [0, h]])

M = cv2.getPerspectiveTransform(src, dst)
birds_eye = cv2.warpPerspective(img, M, (w, h))

# Inverse: swap the point sets to map the bird's-eye image back to the
# original camera perspective.
M_inv = cv2.getPerspectiveTransform(dst, src)
original = cv2.warpPerspective(birds_eye, M_inv, (w, h))
Google tag (gtag.js)