Reputation: 96
I'm trying to transform a point in a 3D world rendered with pyrender to pixel coordinates. The world-to-camera-frame transformation seems to work; however, the camera-to-pixel-frame transformation is incorrect and I can't figure out what I'm doing wrong. I appreciate any hints!
The goal is to get the pixel coordinates uvw of the world-point UVW. Currently, I do the following:
I create a camera from an already existing intrinsic matrix (= K). I do this mainly for debugging purposes, so I can be sure that K is correct:
K = np.array([[415.69219382, 0.,           320.],
              [0.,           415.69219382, 240.],
              [0.,           0.,           1.  ]])
K = np.ascontiguousarray(K, dtype=np.float32)
p_cam = pyrender.camera.IntrinsicsCamera(fx=K[0][0], fy=K[1][1], cx=K[0][2], cy=K[1][2])
scene.add(p_cam, pose=cam_pose.get_transformation_matrix(x=6170., y=4210., z=60., yaw=20, pitch=0, roll=40)) # cam_pose is my own class
I'm creating a transformation matrix with an extrinsic rotation.
def get_transformation_matrix(self, x, y, z, yaw, pitch, roll):
    from scipy.spatial.transform import Rotation as R
    '''
    yaw = rotate around z axis
    pitch = rotate around y axis
    roll = rotate around x axis
    '''
    xyz = np.array([
        [x],
        [y],
        [z]
    ])
    rot = R.from_euler('zyx', [yaw, pitch, roll], degrees=True).as_matrix()
    last_row = np.array([[0, 0, 0, 1]])
    tf_m = np.concatenate((np.concatenate((rot, xyz), axis=1), last_row), axis=0)
    return np.ascontiguousarray(tf_m, dtype=np.float32)
Using the created camera, I render the following image. The point I'm trying to transform is the tip of the roof, which approximately has the pixel coordinates (500,160). I marked it in the 3D scene with the pink cylinder.
from icecream import ic
K = np.concatenate((K, [[0],[0],[0]]), axis = 1)
UVW1 = [[6184],[4245],[38],[1]] #the homogeneous coordinates of the pink cylinder in the world frame
world_to_camera = np.linalg.inv(cam_pose.transformation_matrix).astype('float32') @ UVW1
ic(world_to_camera)
camera_to_pixel = K @ world_to_camera
ic(camera_to_pixel/camera_to_pixel[2]) #Transforming the homogeneous coordinates back
Output:
ic| world_to_camera: array([[ 17.48892188],
[ 7.11796755],
[-39.35071968],
[ 1. ]])
ic| camera_to_pixel/camera_to_pixel[2]: array([[135.25094424],
[164.80738424],
[ 1. ]])
To me, the world_to_camera pose seems like it might be correct (I might be wrong). However, when transforming from the camera frame to the pixel frame, the x-coordinate (135) is wrong (the y-coordinate (164) might still make sense).
Attached a screenshot of the 3D scene. The yellow cylinder+axes represent the camera, while the blue point represents the point I'm trying to transform (earlier pink in the rendered image).
So to me, the only remaining source of error could be the intrinsic matrix; however, I'm defining this matrix myself, so I don't see how it could be incorrect. Is there something I'm blind to?
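For reference, K itself can be sanity-checked in isolation (a minimal sketch, reusing the padded 3x4 K from the snippet above): any point on the optical axis must project exactly to the principal point (cx, cy) = (320, 240).
# Minimal sanity check of K (the padded 3x4 version from above):
# a point on the optical axis must land on the principal point.
pt = np.array([[0.], [0.], [10.], [1.]])  # 10 units straight ahead (+z, OpenCV convention)
proj = K @ pt
print((proj / proj[2])[:2])               # expect [[320.], [240.]] == (cx, cy)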
Upvotes: 1
Views: 394
Reputation: 1
Late answer, but just in case: your code is actually correct, but you are using the view camera (GL representation), which is required for rendering, instead of the actual world_to_camera (OpenCV representation). So change this
world_to_camera = np.linalg.inv(cam_pose.transformation_matrix).astype('float32')
to:
world_to_camera = (cam_pose.transformation_matrix).astype('float32')
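Plugged back into the question's snippet, the corrected projection would look like this (K is the original 3x3 intrinsics, UVW1 and cam_pose exactly as defined in the question, so this is a sketch under those assumptions):
# Sketch of the corrected projection; K (3x3), UVW1 and cam_pose are
# assumed to be exactly as defined in the question.
K34 = np.concatenate((K, [[0], [0], [0]]), axis=1)                   # 3x4 projection matrix
point_cam = cam_pose.transformation_matrix.astype('float32') @ UVW1  # world -> camera, no inverse
uvw = K34 @ point_cam
print(uvw / uvw[2])                                                  # back from homogeneous: (u, v, 1)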
(Images: the rendered 3D mesh, and the 3D points projected to image pixels.)
Here is a complete script that one may use or modify:
import cv2
import numpy as np
import pyrender
import trimesh
from transformations import rotation_matrix
def draw_transformed_3d_axes(
    image: np.ndarray,
    transform: np.ndarray,
    loc: np.ndarray,
    scale: float,
    projection_matrix: np.ndarray,
) -> None:
    """Draw a transformed set of coordinate axes, in color."""
    trsf_4x4 = np.eye(4)
    trsf_4x4[:3, :3] = transform
    axes_edges = np.array([[0, 1], [0, 2], [0, 3]])
    axes_verts = np.vstack([np.zeros((1, 3)), np.eye(3)]) * 3.0
    axes_verts = np.hstack([axes_verts, np.ones((len(axes_verts), 1))])
    axes_verts = np.array([0, 0, 10]) + axes_verts.dot(trsf_4x4.T)[:, :-1]
    projected = axes_verts.dot(projection_matrix.T)
    projected = projected[:, :2] / projected[:, 2:]
    center = np.array([image.shape[0] // 2, image.shape[1] // 2])
    projected = ((projected - center) * scale + loc).astype(int)
    ldmk_connection_pairs = projected[axes_edges].astype(int)
    for p_0, p_1 in ldmk_connection_pairs:
        cv2.line(image, tuple(p_0 + 1), tuple(p_1 + 1), (0, 0, 0), 2, cv2.LINE_AA)
    colors = np.fliplr(np.eye(3) * 255)
    for i, (p_0, p_1) in enumerate(ldmk_connection_pairs):
        cv2.line(image, tuple(p_0), tuple(p_1), colors[i], 2, cv2.LINE_AA)
def _render_mesh(
    vertices: np.ndarray,
    triangles: np.ndarray,
    world_to_cam: np.ndarray,
    cam_to_img: np.ndarray,
    resolution: tuple[int, int],
) -> np.ndarray:
    renderer = pyrender.OffscreenRenderer(resolution[0], resolution[1])
    camera_pr = pyrender.IntrinsicsCamera(
        cx=cam_to_img[0, 2],
        cy=cam_to_img[1, 2],
        fx=cam_to_img[0, 0],
        fy=cam_to_img[1, 1],
        zfar=5000.0,
        name="cam",
    )
    scene = pyrender.Scene(ambient_light=[100, 100, 100], bg_color=[0, 0, 0, 0])
    # OpenCV to OpenGL convention
    world_to_cam_gl = np.linalg.inv(world_to_cam).dot(rotation_matrix(np.pi, [1, 0, 0]))
    camera_node = pyrender.Node(camera=camera_pr, matrix=world_to_cam_gl)
    scene.add_node(camera_node)
    key_light = pyrender.DirectionalLight(color=np.ones(3), intensity=4.0)
    R1 = rotation_matrix(np.radians(25), [0, 1, 0])
    R2 = rotation_matrix(np.radians(-30), [1, 0, 0])
    key_pose = world_to_cam_gl.dot(R1.dot(R2))
    scene.add(key_light, pose=key_pose)
    back_light = pyrender.DirectionalLight(color=np.ones(3), intensity=1.0)
    R1 = rotation_matrix(np.radians(-150), [0, 1, 0])
    back_pose = world_to_cam_gl.dot(R1)
    scene.add(back_light, pose=back_pose)
    mesh_trimesh = trimesh.Trimesh(vertices, triangles, process=False)
    colors = np.repeat([[255, 61, 13]], len(vertices), axis=0)
    mesh_trimesh.visual.vertex_colors = colors
    mesh_pyrender = pyrender.Mesh.from_trimesh(mesh_trimesh, smooth=True)
    mesh_pyrender.primitives[0].material.roughnessFactor = 0.6
    mesh_pyrender.primitives[0].material.alphaMode = "OPAQUE"
    scene.add(mesh_pyrender)
    rendered_img, _ = renderer.render(scene, flags=pyrender.RenderFlags.RGBA | pyrender.RenderFlags.ALL_SOLID)
    renderer.delete()
    return rendered_img.astype(float) / 255
def project_3D_to_2D_pixel(
    vertices: np.ndarray,
    triangles: np.ndarray,
    world_to_cam: np.ndarray,
    cam_to_img: np.ndarray,
    resolution: tuple[int, int],
) -> np.ndarray:
    cam_to_img = np.concatenate((cam_to_img, [[0], [0], [0]]), axis=1)
    world_to_image = cam_to_img @ world_to_cam
    W2P = (world_to_image[:, :3] @ vertices.T + world_to_image[:, 3:4]).T
    pixels = np.zeros_like(W2P[:, 0:2])
    for idx, s in enumerate(W2P[:, 2]):
        if abs(s) > 1e-6:
            pixels[idx, 0] = W2P[idx, 0] / s
            pixels[idx, 1] = W2P[idx, 1] / s
    # assuming a 512x512 image
    img = np.zeros((512, 512)).astype('uint8')
    for p in pixels:
        cv2.circle(img, (int(p[0]), int(p[1])), 1, 255, -1)
    return img
def draw_mesh_from_VF(
    vertices: np.ndarray,
    triangles: np.ndarray,
    image: np.ndarray,
    world_to_cam: np.ndarray,
    cam_to_img: np.ndarray,
) -> np.ndarray:
    render = _render_mesh(vertices, triangles, world_to_cam, cam_to_img, image.shape[:2][::-1])
    # alpha blend the render over the input image
    return (
        ((image.astype(np.float64) / 255) * (1 - 0.75 * render[..., -1:]) + render[..., :3] * 0.75 * render[..., -1:])
        * 255
    ).astype(np.uint8)
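For completeness, a hypothetical usage sketch: the mesh path, the pose values, and the 512x512 resolution below are placeholder assumptions, not part of the script above.
# Hypothetical usage; mesh path, pose and resolution are assumptions.
mesh = trimesh.load("mesh.obj", process=False)  # assuming the file loads as a single Trimesh
cam_to_img = np.array([[415.69219382, 0., 320.],
                       [0., 415.69219382, 240.],
                       [0., 0., 1.]])
world_to_cam = np.eye(4)
world_to_cam[:3, 3] = [0., 0., 10.]             # place the world origin 10 units in front of the camera
img = project_3D_to_2D_pixel(mesh.vertices, mesh.faces, world_to_cam, cam_to_img, (512, 512))
overlay = draw_mesh_from_VF(mesh.vertices, mesh.faces,
                            np.zeros((512, 512, 3), np.uint8), world_to_cam, cam_to_img)
cv2.imwrite("projected.png", img)
cv2.imwrite("overlay.png", overlay)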
Upvotes: 0