Fixed windows MSVC build compatibility #9

Closed · wants to merge 22 commits
Changes from 15 commits
12 changes: 6 additions & 6 deletions pytorch3d/csrc/face_areas_normals/face_areas_normals.cu
@@ -6,7 +6,7 @@
template <typename scalar_t>
__global__ void FaceAreasNormalsKernel(
const scalar_t* __restrict__ verts,
- const long* __restrict__ faces,
+ const int64_t* __restrict__ faces,
scalar_t* __restrict__ face_areas,
scalar_t* __restrict__ face_normals,
const size_t V,
@@ -18,9 +18,9 @@ __global__ void FaceAreasNormalsKernel(
// Each thread computes the area & normal of its respective faces and adds it
// to the global face_areas tensor.
for (size_t f = tid; f < F; f += stride) {
- const long i0 = faces[3 * f + 0];
- const long i1 = faces[3 * f + 1];
- const long i2 = faces[3 * f + 2];
+ const int64_t i0 = faces[3 * f + 0];
+ const int64_t i1 = faces[3 * f + 1];
+ const int64_t i2 = faces[3 * f + 2];

const scalar_t v0_x = verts[3 * i0 + 0];
const scalar_t v0_y = verts[3 * i0 + 1];
@@ -70,12 +70,12 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsCuda(
FaceAreasNormalsKernel<scalar_t>
<<<blocks, threads>>>(
verts.data_ptr<scalar_t>(),
- faces.data_ptr<long>(),
+ faces.data_ptr<int64_t>(),
areas.data_ptr<scalar_t>(),
normals.data_ptr<scalar_t>(),
V,
F);
}));

return std::make_tuple(areas, normals);
}
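Note (illustrative, not part of the diff): the long → int64_t swap matters because PyTorch's face index tensors are 64-bit (kInt64) and at::Tensor::data_ptr<T>() requires T to match the tensor's scalar type. On LP64 platforms (Linux, macOS) long happens to be 64-bit, so data_ptr<long>() compiles; under MSVC's LLP64 model long is only 32-bit and no longer matches. A minimal sketch of the size assumption:

// Illustrative sketch only: why `long` is not a portable stand-in for int64_t.
// On 64-bit Linux/macOS (LP64) sizeof(long) == 8, which is why the old code
// happened to work there.
#include <cstdint>
static_assert(sizeof(int64_t) == 8, "int64_t is 64-bit everywhere");
#ifdef _MSC_VER
static_assert(sizeof(long) == 4, "MSVC (LLP64): long is only 32-bit");
#endif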
8 changes: 4 additions & 4 deletions pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -5,7 +5,7 @@
// TODO(T47953967) to make this cuda kernel support all datatypes.
__global__ void gather_scatter_kernel(
const float* __restrict__ input,
- const long* __restrict__ edges,
+ const int64_t* __restrict__ edges,
float* __restrict__ output,
bool directed,
bool backward,
@@ -21,8 +21,8 @@ __global__ void gather_scatter_kernel(
// Edges are split evenly across the blocks.
for (int e = blockIdx.x; e < E; e += gridDim.x) {
// Get indices of vertices which form the edge.
- const long v0 = edges[2 * e + v0_idx];
- const long v1 = edges[2 * e + v1_idx];
+ const int64_t v0 = edges[2 * e + v0_idx];
+ const int64_t v1 = edges[2 * e + v1_idx];

// Split vertex features evenly across threads.
// This implementation will be quite wasteful when D<128 since there will be
@@ -57,7 +57,7 @@ at::Tensor gather_scatter_cuda(

gather_scatter_kernel<<<blocks, threads>>>(
input.data_ptr<float>(),
- edges.data_ptr<long>(),
+ edges.data_ptr<int64_t>(),
output.data_ptr<float>(),
directed,
backward,
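Note (illustrative, not part of the diff): the gather_scatter kernel walks a flattened (E, 2) tensor of int64_t vertex indices, so edge e's endpoints live at offsets 2*e and 2*e + 1. A hedged CPU sketch of the same accumulation for the simple directed, forward case (function name and simplifications are this note's, not the library's):

// CPU sketch of the edge gather/scatter, directed forward case only.
// input/output are V x D row-major feature matrices; which endpoint acts as
// source vs destination is controlled by the directed/backward flags in the
// real kernel.
#include <cstdint>
void gather_scatter_cpu_sketch(const float* input, const int64_t* edges,
                               float* output, int64_t E, int64_t D) {
  for (int64_t e = 0; e < E; ++e) {
    const int64_t v0 = edges[2 * e + 0];
    const int64_t v1 = edges[2 * e + 1];
    for (int64_t d = 0; d < D; ++d) {
      output[v0 * D + d] += input[v1 * D + d];  // accumulate neighbor features
    }
  }
}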
@@ -6,7 +6,7 @@
template <typename scalar_t>
__device__ void WarpReduce(
volatile scalar_t* min_dists,
- volatile long* min_idxs,
+ volatile int64_t* min_idxs,
const size_t tid) {
// s = 32
if (min_dists[tid] > min_dists[tid + 32]) {
@@ -57,7 +57,7 @@ template <typename scalar_t>
__global__ void NearestNeighborKernel(
const scalar_t* __restrict__ points1,
const scalar_t* __restrict__ points2,
- long* __restrict__ idx,
+ int64_t* __restrict__ idx,
const size_t N,
const size_t P1,
const size_t P2,
@@ -74,7 +74,7 @@ __global__ void NearestNeighborKernel(
extern __shared__ char shared_buf[];
scalar_t* x = (scalar_t*)shared_buf; // scalar_t[DD]
scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
- long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+ int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]

const size_t n = blockIdx.y; // index of batch element.
const size_t i = blockIdx.x; // index of point within batch element.
@@ -147,14 +147,14 @@ template <typename scalar_t>
__global__ void NearestNeighborKernelD3(
const scalar_t* __restrict__ points1,
const scalar_t* __restrict__ points2,
- long* __restrict__ idx,
+ int64_t* __restrict__ idx,
const size_t N,
const size_t P1,
const size_t P2) {
// Single shared memory buffer which is split and cast to different types.
extern __shared__ char shared_buf[];
scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
- long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+ int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]

const size_t D = 3;
const size_t n = blockIdx.y; // index of batch element.
@@ -230,12 +230,12 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
// Use the specialized kernel for D=3.
AT_DISPATCH_FLOATING_TYPES(p1.type(), "nearest_neighbor_v3_cuda", ([&] {
size_t shared_size = threads * sizeof(size_t) +
- threads * sizeof(long);
+ threads * sizeof(int64_t);
NearestNeighborKernelD3<scalar_t>
<<<blocks, threads, shared_size>>>(
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
- idx.data_ptr<long>(),
+ idx.data_ptr<int64_t>(),
N,
P1,
P2);
@@ -248,11 +248,11 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
// need to be rounded to the next even size.
size_t D_2 = D + (D % 2);
size_t shared_size = (D_2 + threads) * sizeof(size_t);
- shared_size += threads * sizeof(long);
+ shared_size += threads * sizeof(int64_t);
NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
- idx.data_ptr<long>(),
+ idx.data_ptr<int64_t>(),
N,
P1,
P2,
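Note (illustrative, not part of the diff): the shared-memory size computed on the host has to match how the kernel carves up the buffer. Once min_idxs is treated as int64_t* inside the kernel, budgeting sizeof(long) per thread would under-allocate that half of the buffer by 4 bytes per thread under MSVC (where long is 32-bit), so the host-side sizing switches to sizeof(int64_t) as well. A hedged sketch of the sizing, mirroring the code above (variable names illustrative):

// Sketch of the dynamic shared-memory budget for the nearest-neighbor kernels.
#include <cstddef>
#include <cstdint>
size_t threads = 128;                           // blockDim.x of the launch
size_t dist_bytes = threads * sizeof(size_t);   // upper bound for scalar_t[threads], as above
size_t idx_bytes = threads * sizeof(int64_t);   // int64_t[threads]; sizeof(long) is 4 under MSVC
size_t shared_size = dist_bytes + idx_bytes;    // passed as the third <<<...>>> launch argument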
6 changes: 5 additions & 1 deletion pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh
@@ -7,7 +7,11 @@
#include "float_math.cuh"

// Set epsilon for preventing floating point errors and division by 0.
+ #ifdef _MSC_VER
+ #define kEpsilon 1e-30f
+ #else
const auto kEpsilon = 1e-30;
+ #endif

// Determines whether a point p is on the right side of a 2D line segment
// given by the end points v0, v1.
@@ -93,7 +97,7 @@ BarycentricCoordsBackward(
const float2& v2,
const float3& grad_bary_upstream) {
const float area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
const float area2 = pow(area, 2.0);
const float area2 = pow(area, 2.0f);
const float e0 = EdgeFunctionForward(p, v1, v2);
const float e1 = EdgeFunctionForward(p, v2, v0);
const float e2 = EdgeFunctionForward(p, v0, v1);
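Note (illustrative, not part of the diff): `const auto kEpsilon = 1e-30;` deduces a double, and the MSVC branch replaces it with a plain macro carrying an `f`-suffixed literal; presumably this sidesteps both the use of a namespace-scope const from device code, which some MSVC + nvcc combinations reject, and an unwanted float to double promotion. The same reasoning applies to `pow(area, 2.0f)`: the float literal keeps the call on the single-precision overload. A small hedged device-code illustration (nvcc syntax, not library code):

// Hedged illustration: keep the epsilon/pow arithmetic in float on all toolchains.
__device__ float area_squared(float area) {
  // pow(area, 2.0) would pick a double overload and then narrow the result;
  // powf with a 2.0f literal stays in single precision throughout.
  return powf(area, 2.0f);
}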
6 changes: 3 additions & 3 deletions pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
@@ -7,7 +7,7 @@
// Given a pixel coordinate 0 <= i < S, convert it to a normalized device
// coordinate in the range [-1, 1]. The NDC range is divided into S evenly-sized
// pixels, and assume that each pixel falls in the *center* of its range.
- inline float PixToNdc(const int i, const int S) {
+ static float PixToNdc(const int i, const int S) {
// NDC x-offset + (i * pixel_width + half_pixel_width)
return -1 + (2 * i + 1.0f) / S;
}
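Note (illustrative, not part of the diff): `static` gives PixToNdc internal linkage in place of `inline`; the PR does not spell out the exact MSVC issue this avoids, and the function's behaviour is unchanged. As a quick worked example of the mapping, with S = 4 the pixel centers land at -0.75, -0.25, 0.25 and 0.75, evenly spaced and strictly inside [-1, 1]:

// Worked example of the pixel-center to NDC mapping used above.
#include <cstdio>
int main() {
  const int S = 4;
  for (int i = 0; i < S; ++i) {
    std::printf("PixToNdc(%d, %d) = %.2f\n", i, S, -1 + (2 * i + 1.0f) / S);
  }
  return 0;
}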
@@ -74,7 +74,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
return std::make_tuple(point_idxs, zbuf, pix_dists);
}

- std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
+ torch::Tensor RasterizePointsCoarseCpu(
const torch::Tensor& points,
const int image_size,
const float radius,
@@ -140,7 +140,7 @@ std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
bin_y_max = bin_y_min + bin_width;
}
}
- return std::make_tuple(points_per_bin, bin_points);
+ return bin_points;
}

torch::Tensor RasterizePointsBackwardCpu(