torch · soumith · Apr 3, 2017 · May 20, 2016 · May 20, 2016 · May 20, 2016
diff --git a/doc/tensor.md b/doc/tensor.md
@@ -4,14 +4,14 @@
 The `Tensor` class is probably the most important class in
 `Torch`. Almost every package depends on this class. It is *__the__*
 class for handling numeric data. As with   pretty much anything in
-[Torch7](./../index.md), tensors are
+[Torch7](./index.md), tensors are
 [serializable](file.md#torch.File.serialization).
 
 __Multi-dimensional matrix__
 
-A `Tensor` is a potentially multi-dimensional matrix. The number of
-dimensions is unlimited that can be created using
-[LongStorage](storage.md) with more dimensions.
+A `Tensor` is a multi-dimensional matrix. The number of
+dimensions is unlimited (up to what can be created using
+[LongStorage](storage.md)).
 
 Example:
 ```lua

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
@@ -242,32 +242,46 @@ endif()
 # Determine if blas was compiled with the f2c conventions
 IF (BLAS_LIBRARIES)
   SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+
+  # Probe 1: call sdot_ through a prototype that returns double.
+  # Integer arguments are passed as BLINT (__int64 on Windows, long elsewhere).
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef _WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern double sdot_();
 int main() {
-  int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_DOUBLE_WORKS )
+
+  # Probe 2: same computation, but sdot_ is declared as returning float.
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef _WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern float sdot_();
 int main() {
-  int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_FLOAT_WORKS )
+
   IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     MESSAGE(STATUS "This BLAS uses the F2C return conventions")
     SET(BLAS_F2C TRUE)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
@@ -41,7 +41,7 @@ CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "em64t")
   SET(iccvers "intel64")
-  SET(mkl64s "_lp64")
+  SET(mkl64s "_ilp64")
 ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "32")
   SET(iccvers "ia32")

diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
@@ -73,7 +73,7 @@ SET(AVX2_CODE "
 
   int main()
   {
-    __m256i a;
+    __m256i a = {0};
     a = _mm256_abs_epi16(a);
     return 0;
   }

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
@@ -9,24 +9,37 @@
 # define ffloat float
 #endif
 
-TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx);
-TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx);
-TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
-TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
-TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
-TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
-TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
-TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
+/* BLAS_INT is the integer type used in the BLAS prototypes and calls below. Define MKL_LP64 to make it plain int. */
+#ifndef MKL_LP64
+ /* default: __int64 on Windows (keyed on the compiler-predefined _WIN32), long elsewhere */
+ #ifdef _WIN32
+  #define BLAS_INT __int64
+ #else
+  #define BLAS_INT long
+ #endif
+#else
+ /* MKL_LP64 defined: plain int */
+ #define BLAS_INT int
+#endif
 
 
+TH_EXTERNC void dswap_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void sswap_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dscal_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx);
+TH_EXTERNC void sscal_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx);
+TH_EXTERNC void dcopy_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void scopy_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void daxpy_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void saxpy_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC double ddot_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC ffloat sdot_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, double *alpha, double *a, BLAS_INT *lda, double *x, BLAS_INT *incx, double *beta, double *y, BLAS_INT *incy);
+TH_EXTERNC void sgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, float *alpha, float *a, BLAS_INT *lda, float *x, BLAS_INT *incx, float *beta, float *y, BLAS_INT *incy);
+TH_EXTERNC void dger_(BLAS_INT *m, BLAS_INT *n, double *alpha, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy, double *a, BLAS_INT *lda);
+TH_EXTERNC void sger_(BLAS_INT *m, BLAS_INT *n, float *alpha, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy, float *a, BLAS_INT *lda);
+TH_EXTERNC void dgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, double *alpha, double *a, BLAS_INT *lda, double *b, BLAS_INT *ldb, double *beta, double *c, BLAS_INT *ldc);
+TH_EXTERNC void sgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, float *alpha, float *a, BLAS_INT *lda, float *b, BLAS_INT *ldb, float *beta, float *c, BLAS_INT *ldc);
+
 
 void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
 {
@@ -39,9 +52,9 @@ void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dswap_(&i_n, x, &i_incx, y, &i_incy);
@@ -70,8 +83,8 @@ void THBlas_(scal)(long n, real a, real *x, long incx)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dscal_(&i_n, &a, x, &i_incx);
@@ -99,9 +112,9 @@ void THBlas_(copy)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dcopy_(&i_n, x, &i_incx, y, &i_incy);
@@ -129,9 +142,9 @@ void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     daxpy_(&i_n, &a, x, &i_incx, y, &i_incy);
@@ -159,9 +172,9 @@ real THBlas_(dot)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     return (real) ddot_(&i_n, x, &i_incx, y, &i_incy);
@@ -190,11 +203,11 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
       (incx > 0) && (incx <= INT_MAX) &&
       (incy > 0) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
@@ -245,11 +258,11 @@ void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
@@ -304,12 +317,12 @@ void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha,
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_k = (int)k;
-    int i_lda = (int)lda;
-    int i_ldb = (int)ldb;
-    int i_ldc = (int)ldc;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_k = (BLAS_INT)k;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_ldb = (BLAS_INT)ldb;
+    BLAS_INT i_ldc = (BLAS_INT)ldc;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);

diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
@@ -1,4 +1,3 @@
-#if defined(__AVX__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
 #else
@@ -271,4 +270,3 @@ void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdi
   }
 }
 
-#endif // defined(__AVX__)
diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
@@ -2,22 +2,23 @@
 #define TH_AVX_H
 
 #include <stddef.h>
+#include "THGeneral.h"
 
-void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
-void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
-void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
-void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
+TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+TH_API void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+TH_API void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
+TH_API void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+TH_API void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+TH_API void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
 
 #endif
diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c
@@ -1,4 +1,3 @@
-#if defined(__AVX2__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
 #else
@@ -43,5 +42,3 @@ void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const flo
     z[i] = x[i] + y[i] * c;
   }
 }
-
-#endif // defined(__AVX2__)
diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
@@ -2,8 +2,9 @@
 #define TH_AVX2_H
 
 #include <stddef.h>
+#include "THGeneral.h"
 
-void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
 
 #endif