// MulCost<ArgType>(): returns the `Cost` constant that
// internal::functor_traits reports for
// internal::scalar_product_op<ArgType, ArgType>.
// NOTE(review): in this listing the include guard and the start of the
// template header share one physical line, and the function's closing brace
// is not shown; the code below is therefore left untouched.
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 30 template <
typename ArgType>
31 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int MulCost() {
32 return internal::functor_traits<
33 internal::scalar_product_op<ArgType, ArgType> >::Cost;
35 template <
typename ArgType>
36 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int AddCost() {
37 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
39 template <
typename ArgType>
40 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int DivCost() {
41 return internal::functor_traits<
42 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
44 template <
typename ArgType>
45 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int ModCost() {
46 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
48 template <
typename SrcType,
typename TargetType>
49 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int CastCost() {
50 return internal::functor_traits<
51 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
55 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
57 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles)
58 : bytes_loaded_(bytes_loaded),
59 bytes_stored_(bytes_stored),
60 compute_cycles_(compute_cycles) {}
// Builds a cost from explicit byte counts and a compute-cycle count, with an
// optional vectorization adjustment: when `vectorized` is true the stored
// compute cycles are compute_cycles / packet_size.
// NOTE(review): the else-branch of the conditional and the opening brace of
// the body (original line 68) are absent from this listing; presumably the
// non-vectorized case stores compute_cycles unchanged -- confirm against the
// full file.
63 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles,
64 bool vectorized,
double packet_size)
65 : bytes_loaded_(bytes_loaded),
66 bytes_stored_(bytes_stored),
67 compute_cycles_(vectorized ? compute_cycles / packet_size
// Each of the three cost arguments must be non-negative and finite.
// packet_size is not checked by these assertions.
69 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
70 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
71 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
// Const accessor returning a double.
// NOTE(review): the body is not present in this listing; presumably it
// returns bytes_loaded_ -- confirm against the full file.
74 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_loaded()
const {
// Const accessor returning a double.
// NOTE(review): the body is not present in this listing; presumably it
// returns bytes_stored_ -- confirm against the full file.
77 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_stored()
const {
80 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double compute_cycles()
const {
81 return compute_cycles_;
83 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double total_cost(
84 double load_cost,
double store_cost,
double compute_cost)
const {
85 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
86 compute_cost * compute_cycles_;
// Non-const member taking no arguments and returning nothing.
// NOTE(review): the body is not present in this listing; the name suggests
// it clears the byte-load/byte-store components -- confirm against the full
// file.
91 EIGEN_DEVICE_FUNC
void dropMemoryCost() {
97 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
98 const TensorOpCost& rhs)
const {
99 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
100 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
101 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
102 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
107 const TensorOpCost& rhs)
const {
108 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
109 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
110 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
111 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
// In-place addition: adds each of rhs's three components (read through its
// accessors) to the matching component of this cost.
// NOTE(review): the return statement and closing brace are not present in
// this listing; given the TensorOpCost& return type it presumably returns
// *this -- confirm against the full file.
114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
115 const TensorOpCost& rhs) {
116 bytes_loaded_ += rhs.bytes_loaded();
117 bytes_stored_ += rhs.bytes_stored();
118 compute_cycles_ += rhs.compute_cycles();
// In-place scaling: multiplies all three components of this cost by the
// scalar `rhs`.
// NOTE(review): the return statement and closing brace are not present in
// this listing; given the TensorOpCost& return type it presumably returns
// *this -- confirm against the full file.
122 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(
double rhs) {
123 bytes_loaded_ *= rhs;
124 bytes_stored_ *= rhs;
125 compute_cycles_ *= rhs;
// Friend binary `+` on two costs; `lhs` is taken by value and `rhs` by const
// reference, and a TensorOpCost is returned by value.
// NOTE(review): the body is not present in this listing; presumably it is
// implemented via operator+= on the `lhs` copy -- confirm against the full
// file.
129 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator+(
130 TensorOpCost lhs,
const TensorOpCost& rhs) {
// Friend binary `*` with the cost on the left and a scalar on the right;
// `lhs` is taken by value and a TensorOpCost is returned by value.
// NOTE(review): the body is not present in this listing; presumably it is
// implemented via operator*= on the `lhs` copy -- confirm against the full
// file.
134 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
135 TensorOpCost lhs,
double rhs) {
// Friend binary `*` with the scalar on the left and the cost on the right;
// `rhs` is taken by value and a TensorOpCost is returned by value.
// NOTE(review): the body is not present in this listing; presumably it is
// implemented via operator*= on the `rhs` copy -- confirm against the full
// file.
139 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
140 double lhs, TensorOpCost rhs) {
145 friend std::ostream& operator<<(std::ostream& os,
const TensorOpCost& tc) {
146 return os <<
"[bytes_loaded = " << tc.bytes_loaded()
147 <<
", bytes_stored = " << tc.bytes_stored()
148 <<
", compute_cycles = " << tc.compute_cycles() <<
"]";
// The three components of a cost. total_cost() weights them by its
// load_cost, store_cost and compute_cost arguments respectively.
// Component weighted by load_cost.
152 double bytes_loaded_;
// Component weighted by store_cost.
153 double bytes_stored_;
// Component weighted by compute_cost.
154 double compute_cycles_;
// Cost model, parameterized on a Device type, whose static helpers turn a
// per-coefficient TensorOpCost into a total cost, a thread count and a task
// size.
// NOTE(review): the access specifier and the closing brace of this class are
// not present in this listing.
160 template <
typename Device>
161 class TensorCostModel {
// Passed to TensorOpCost::total_cost() as the weight of the compute-cycle
// component in totalCost().
164 static const int kDeviceCyclesPerComputeCycle = 1;
// Subtracted from the total cost in numThreads() before dividing by
// kPerThreadCycles.
167 static const int kStartupCycles = 100000;
// Divisor applied in numThreads() to the cost remaining after
// kStartupCycles has been subtracted.
168 static const int kPerThreadCycles = 100000;
// Divisor applied to the total cost in taskSize().
169 static const int kTaskSize = 40000;
174 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int numThreads(
175 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
176 double cost = totalCost(output_size, cost_per_coeff);
177 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178 return numext::mini(max_threads, numext::maxi(1, threads));
184 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double taskSize(
185 double output_size,
const TensorOpCost& cost_per_coeff) {
186 return totalCost(output_size, cost_per_coeff) / kTaskSize;
// Computes a total cost from `output_size` and the per-coefficient cost.
// The per-coefficient cost is reduced to a single number with
// TensorOpCost::total_cost(), weighting loads by kLoadCycles, stores by
// kStoreCycles and compute cycles by kDeviceCyclesPerComputeCycle.
// NOTE(review): the original comments and the line that begins the return
// expression (original lines 192-200 and 203-204) are absent from this
// listing; presumably the result is output_size times the per-coefficient
// total -- confirm against the full file.
190 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double totalCost(
191 double output_size,
const TensorOpCost& cost_per_coeff) {
// Weights applied per byte loaded / stored: 11/64 each.
// NOTE(review): the rationale for 11 and 64 is not visible here.
201 const double kLoadCycles = 1.0 / 64 * 11;
202 const double kStoreCycles = 1.0 / 64 * 11;
205 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
206 kDeviceCyclesPerComputeCycle);
212 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
// Documentation-tooltip residue from the source listing (not part of the
// header): "Namespace containing all symbols from the Eigen library.
// Definition: AdolcForward:45"