/* Halide.h -- interface for the 'Halide' library. Copyright (c) 2012-2020 MIT CSAIL, Google, Facebook, Adobe, NVIDIA CORPORATION, and other contributors. Developed by: The Halide team http://halide-lang.org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ----- apps/bgu is Copyright 2016 Google Inc. and is Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http ://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ----- apps/support/cmdline.h is Copyright (c) 2009, Hideyuki Tanaka and is licensed under the BSD 3-Clause license. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef HALIDE_H #define HALIDE_H #ifndef HALIDE_ADD_ATOMIC_MUTEX_H #define HALIDE_ADD_ATOMIC_MUTEX_H #ifndef HALIDE_EXPR_H #define HALIDE_EXPR_H /** \file * Base classes for Halide expressions (\ref Halide::Expr) and statements (\ref Halide::Internal::Stmt) */ #include <string> #include <vector> #ifndef HALIDE_INTRUSIVE_PTR_H #define HALIDE_INTRUSIVE_PTR_H /** \file * * Support classes for reference-counting via intrusive shared * pointers. */ #include <atomic> #include <cstdlib> #ifndef HALIDE_HALIDERUNTIME_H #define HALIDE_HALIDERUNTIME_H #ifndef COMPILING_HALIDE_RUNTIME #include <stddef.h> #include <stdint.h> #include <stdbool.h> #include <string.h> #else #error "COMPILING_HALIDE_RUNTIME should never be defined for Halide.h" #endif #ifdef __cplusplus // Forward declare type to allow naming typed handles. // See Type.h for documentation. template<typename T> struct halide_handle_traits; #endif #ifdef __cplusplus extern "C" { #endif // Note that you should not use "inline" along with HALIDE_ALWAYS_INLINE; // it is not necessary, and may produce warnings for some build configurations. #ifdef _MSC_VER #define HALIDE_ALWAYS_INLINE __forceinline #define HALIDE_NEVER_INLINE __declspec(noinline) #else #define HALIDE_ALWAYS_INLINE __attribute__((always_inline)) inline #define HALIDE_NEVER_INLINE __attribute__((noinline)) #endif #ifndef HALIDE_MUST_USE_RESULT #ifdef __has_attribute #if __has_attribute(nodiscard) // C++17 or later #define HALIDE_MUST_USE_RESULT [[nodiscard]] #elif __has_attribute(warn_unused_result) // Clang/GCC #define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) #else #define HALIDE_MUST_USE_RESULT #endif #else #define HALIDE_MUST_USE_RESULT #endif #endif /** \file * * This file declares the routines used by Halide internally in its * runtime. On platforms that support weak linking, these can be * replaced with user-defined versions by defining an extern "C" * function with the same name and signature. * * When doing Just In Time (JIT) compilation, methods on the Func being * compiled must be called instead. The corresponding methods are * documented below. * * All of these functions take a "void *user_context" parameter as their * first argument; if the Halide kernel that calls back to any of these * functions has been compiled with the UserContext feature set on its Target, * then the value of that pointer passed from the code that calls the * Halide kernel is piped through to the function. * * Some of these are also useful to call when using the default * implementation. E.g. halide_shutdown_thread_pool. * * Note that even on platforms with weak linking, some linker setups * may not respect the override you provide. E.g. if the override is * in a shared library and the halide object files are linked directly * into the output, the builtin versions of the runtime functions will * be called. See your linker documentation for more details. On * Linux, LD_DYNAMIC_WEAK=1 may help. * */ // Forward-declare to suppress warnings if compiling as C. struct halide_buffer_t; /** Print a message to stderr. Main use is to support tracing * functionality, print, and print_when calls. Also called by the default * halide_error. This function can be replaced in JITed code by using * halide_set_custom_print and providing an implementation of halide_print * in AOT code. See Func::set_custom_print.
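 *
 * As an illustration only (not part of the API proper), a minimal sketch of
 * replacing this routine in AOT code on a platform with weak linking; the
 * stderr formatting chosen here is an assumption, not Halide's default:
 *
 *     #include <cstdio>
 *     #include "HalideRuntime.h"
 *
 *     extern "C" void halide_print(void *user_context, const char *msg) {
 *         // All print, print_when, and tracing output funnels through here.
 *         fprintf(stderr, "[halide] %s", msg);
 *     }
 *
 * The same kind of handler can instead be installed at runtime with
 * halide_set_custom_print(), declared just below, without relying on weak
 * linking.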
*/ // @{ extern void halide_print(void *user_context, const char *); extern void halide_default_print(void *user_context, const char *); typedef void (*halide_print_t)(void *, const char *); extern halide_print_t halide_set_custom_print(halide_print_t print); // @} /** Halide calls this function on runtime errors (for example bounds * checking failures). This function can be replaced in JITed code by * using Func::set_error_handler, or in AOT code by calling * halide_set_error_handler. In AOT code on platforms that support * weak linking (i.e. not Windows), you can also override it by simply * defining your own halide_error. */ // @{ extern void halide_error(void *user_context, const char *); extern void halide_default_error(void *user_context, const char *); typedef void (*halide_error_handler_t)(void *, const char *); extern halide_error_handler_t halide_set_error_handler(halide_error_handler_t handler); // @} /** Cross-platform mutex. Must be initialized with zero and implementation * must treat zero as an unlocked mutex with no waiters, etc. */ struct halide_mutex { uintptr_t _private[1]; }; /** Cross platform condition variable. Must be initialized to 0. */ struct halide_cond { uintptr_t _private[1]; }; /** A basic set of mutex and condition variable functions, which call * platform specific code for mutual exclusion. Equivalent to posix * calls. */ //@{ extern void halide_mutex_lock(struct halide_mutex *mutex); extern void halide_mutex_unlock(struct halide_mutex *mutex); extern void halide_cond_signal(struct halide_cond *cond); extern void halide_cond_broadcast(struct halide_cond *cond); extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex); //@} /** Functions for constructing/destroying/locking/unlocking arrays of mutexes. */ struct halide_mutex_array; //@{ extern struct halide_mutex_array *halide_mutex_array_create(int sz); extern void halide_mutex_array_destroy(void *user_context, void *array); extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); //@} /** Define halide_do_par_for to replace the default thread pool * implementation. halide_shutdown_thread_pool can also be called to * release resources used by the default thread pool on platforms * where it makes sense. (E.g. On Mac OS, Grand Central Dispatch is * used so %Halide does not own the threads backing the pool and they * cannot be released.) See Func::set_custom_do_task and * Func::set_custom_do_par_for. Should return zero if all the jobs * return zero, or an arbitrarily chosen return value from one of the * jobs otherwise. */ //@{ typedef int (*halide_task_t)(void *user_context, int task_number, uint8_t *closure); extern int halide_do_par_for(void *user_context, halide_task_t task, int min, int size, uint8_t *closure); extern void halide_shutdown_thread_pool(); //@} /** Set a custom method for performing a parallel for loop. Returns * the old do_par_for handler. */ typedef int (*halide_do_par_for_t)(void *, halide_task_t, int, int, uint8_t *); extern halide_do_par_for_t halide_set_custom_do_par_for(halide_do_par_for_t do_par_for); /** An opaque struct representing a semaphore. Used by the task system for async tasks. */ struct halide_semaphore_t { uint64_t _private[2]; }; /** A struct representing a semaphore and a number of items that must * be acquired from it. Used in halide_parallel_task_t below. 
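 *
 * A hedged sketch of how a custom task system might set one of these up
 * (the names sem and acquire are hypothetical, and the counts arbitrary):
 *
 *     struct halide_semaphore_t sem;
 *     halide_semaphore_init(&sem, 4);   // semaphore starts with 4 items
 *
 *     struct halide_semaphore_acquire_t acquire;
 *     acquire.semaphore = &sem;
 *     acquire.count = 1;                // one item must be acquired per call
 *
 * Every entry in the semaphores array of a halide_parallel_task_t must be
 * acquired before each call into the task function (see below).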
*/ struct halide_semaphore_acquire_t { struct halide_semaphore_t *semaphore; int count; }; extern int halide_semaphore_init(struct halide_semaphore_t *, int n); extern int halide_semaphore_release(struct halide_semaphore_t *, int n); extern bool halide_semaphore_try_acquire(struct halide_semaphore_t *, int n); typedef int (*halide_semaphore_init_t)(struct halide_semaphore_t *, int); typedef int (*halide_semaphore_release_t)(struct halide_semaphore_t *, int); typedef bool (*halide_semaphore_try_acquire_t)(struct halide_semaphore_t *, int); /** A task representing a serial for loop evaluated over some range. * Note that task_parent is a pass through argument that should be * passed to any dependent tasks that are invoked using halide_do_parallel_tasks * underneath this call. */ typedef int (*halide_loop_task_t)(void *user_context, int min, int extent, uint8_t *closure, void *task_parent); /** A parallel task to be passed to halide_do_parallel_tasks. This * task may recursively call halide_do_parallel_tasks, and there may * be complex dependencies between seemingly unrelated tasks expressed * using semaphores. If you are using a custom task system, care must * be taken to avoid potential deadlock. This can be done by carefully * respecting the static metadata at the end of the task struct. */ struct halide_parallel_task_t { // The function to call. It takes a user context, a min and // extent, a closure, and a task system pass through argument. halide_loop_task_t fn; // The closure to pass to it uint8_t *closure; // The name of the function to be called. For debugging purposes only. const char *name; // An array of semaphores that must be acquired before the // function is called. Must be reacquired for every call made. struct halide_semaphore_acquire_t *semaphores; int num_semaphores; // The entire range the function should be called over. This range // may be sliced up and the function called multiple times. int min, extent; // A parallel task provides several pieces of metadata to prevent // unbounded resource usage or deadlock. // The first is the minimum number of execution contexts (call // stacks or threads) necessary for the function to run to // completion. This may be greater than one when there is nested // parallelism with internal producer-consumer relationships // (calling the function recursively spawns and blocks on parallel // sub-tasks that communicate with each other via semaphores). If // a parallel runtime calls the function when fewer than this many // threads are idle, it may need to create more threads to // complete the task, or else risk deadlock due to committing all // threads to tasks that cannot complete without more. // // FIXME: Note that extern stages are assumed to only require a // single thread to complete. If the extern stage is itself a // Halide pipeline, this may be an underestimate. int min_threads; // If true, the calls to the function should be in serial order from min to min+extent-1, with only // one executing at a time. If false, any order is fine, and // concurrency is fine. bool serial; }; /** Enqueue some number of the tasks described above and wait for them * to complete. While waiting, the calling thread assists with either * the tasks enqueued, or other non-blocking tasks in the task * system. Note that task_parent should be NULL for top-level calls * and the pass through argument if this call is being made from * another task.
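 *
 * A minimal sketch of a top-level call (the task body, closure, and extents
 * here are made up for illustration):
 *
 *     int my_loop_body(void *user_context, int min, int extent,
 *                      uint8_t *closure, void *task_parent) {
 *         // Any nested halide_do_parallel_tasks calls made from inside a
 *         // task should forward task_parent rather than passing NULL.
 *         return 0;
 *     }
 *
 *     struct halide_parallel_task_t task;
 *     task.fn = my_loop_body;
 *     task.closure = NULL;
 *     task.name = "my_loop_body";
 *     task.semaphores = NULL;
 *     task.num_semaphores = 0;
 *     task.min = 0;
 *     task.extent = 100;
 *     task.min_threads = 1;
 *     task.serial = false;
 *
 *     int err = halide_do_parallel_tasks(NULL, 1, &task, NULL);  // top level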
*/ extern int halide_do_parallel_tasks(void *user_context, int num_tasks, struct halide_parallel_task_t *tasks, void *task_parent); /** If you use the default do_par_for, you can still set a custom * handler to perform each individual task. Returns the old handler. */ //@{ typedef int (*halide_do_task_t)(void *, halide_task_t, int, uint8_t *); extern halide_do_task_t halide_set_custom_do_task(halide_do_task_t do_task); extern int halide_do_task(void *user_context, halide_task_t f, int idx, uint8_t *closure); //@} /** The version of do_task called for loop tasks. By default calls the * loop task with the same arguments. */ // @{ typedef int (*halide_do_loop_task_t)(void *, halide_loop_task_t, int, int, uint8_t *, void *); extern halide_do_loop_task_t halide_set_custom_do_loop_task(halide_do_loop_task_t do_task); extern int halide_do_loop_task(void *user_context, halide_loop_task_t f, int min, int extent, uint8_t *closure, void *task_parent); //@} /** Provide an entire custom tasking runtime via function * pointers. Note that do_task and semaphore_try_acquire are only ever * called by halide_default_do_par_for and * halide_default_do_parallel_tasks, so it's only necessary to provide * those if you are mixing in the default implementations of * do_par_for and do_parallel_tasks. */ // @{ typedef int (*halide_do_parallel_tasks_t)(void *, int, struct halide_parallel_task_t *, void *task_parent); extern void halide_set_custom_parallel_runtime( halide_do_par_for_t, halide_do_task_t, halide_do_loop_task_t, halide_do_parallel_tasks_t, halide_semaphore_init_t, halide_semaphore_try_acquire_t, halide_semaphore_release_t); // @} /** The default versions of the parallel runtime functions. */ // @{ extern int halide_default_do_par_for(void *user_context, halide_task_t task, int min, int size, uint8_t *closure); extern int halide_default_do_parallel_tasks(void *user_context, int num_tasks, struct halide_parallel_task_t *tasks, void *task_parent); extern int halide_default_do_task(void *user_context, halide_task_t f, int idx, uint8_t *closure); extern int halide_default_do_loop_task(void *user_context, halide_loop_task_t f, int min, int extent, uint8_t *closure, void *task_parent); extern int halide_default_semaphore_init(struct halide_semaphore_t *, int n); extern int halide_default_semaphore_release(struct halide_semaphore_t *, int n); extern bool halide_default_semaphore_try_acquire(struct halide_semaphore_t *, int n); // @} struct halide_thread; /** Spawn a thread. Returns a handle to the thread for the purposes of * joining it. The thread must be joined in order to clean up any * resources associated with it. */ extern struct halide_thread *halide_spawn_thread(void (*f)(void *), void *closure); /** Join a thread. */ extern void halide_join_thread(struct halide_thread *); /** Set the number of threads used by Halide's thread pool. Returns * the old number. * * n < 0 : error condition * n == 0 : use a reasonable system default (typically, number of cpus online). * n == 1 : use exactly one thread; this will always enforce serial execution * n > 1 : use a pool of exactly n threads. * * (Note that this is only guaranteed when using the default implementations * of halide_do_par_for(); custom implementations may completely ignore values * passed to halide_set_num_threads().) */ extern int halide_set_num_threads(int n); /** Halide calls these functions to allocate and free memory. 
To * replace in AOT code, use the halide_set_custom_malloc and * halide_set_custom_free, or (on platforms that support weak * linking), simply define these functions yourself. In JIT-compiled * code use Func::set_custom_allocator. * * If you override them, and find yourself wanting to call the default * implementation from within your override, use * halide_default_malloc/free. * * Note that halide_malloc must return a pointer aligned to the * maximum meaningful alignment for the platform for the purpose of * vector loads and stores. The default implementation uses 32-byte * alignment, which is safe for arm and x86. Additionally, it must be * safe to read at least 8 bytes before the start and beyond the * end. */ //@{ extern void *halide_malloc(void *user_context, size_t x); extern void halide_free(void *user_context, void *ptr); extern void *halide_default_malloc(void *user_context, size_t x); extern void halide_default_free(void *user_context, void *ptr); typedef void *(*halide_malloc_t)(void *, size_t); typedef void (*halide_free_t)(void *, void *); extern halide_malloc_t halide_set_custom_malloc(halide_malloc_t user_malloc); extern halide_free_t halide_set_custom_free(halide_free_t user_free); //@} /** Halide calls these functions to interact with the underlying * system runtime functions. To replace in AOT code on platforms that * support weak linking, define these functions yourself, or use * the halide_set_custom_load_library() and halide_set_custom_get_library_symbol() * functions. In JIT-compiled code, use JITSharedRuntime::set_default_handlers(). * * halide_load_library and halide_get_library_symbol are equivalent to * dlopen and dlsym. halide_get_symbol(sym) is equivalent to * dlsym(RTLD_DEFAULT, sym). */ //@{ extern void *halide_get_symbol(const char *name); extern void *halide_load_library(const char *name); extern void *halide_get_library_symbol(void *lib, const char *name); extern void *halide_default_get_symbol(const char *name); extern void *halide_default_load_library(const char *name); extern void *halide_default_get_library_symbol(void *lib, const char *name); typedef void *(*halide_get_symbol_t)(const char *name); typedef void *(*halide_load_library_t)(const char *name); typedef void *(*halide_get_library_symbol_t)(void *lib, const char *name); extern halide_get_symbol_t halide_set_custom_get_symbol(halide_get_symbol_t user_get_symbol); extern halide_load_library_t halide_set_custom_load_library(halide_load_library_t user_load_library); extern halide_get_library_symbol_t halide_set_custom_get_library_symbol(halide_get_library_symbol_t user_get_library_symbol); //@} /** Called when debug_to_file is used inside %Halide code. See * Func::debug_to_file for how this is called * * Cannot be replaced in JITted code at present. */ extern int32_t halide_debug_to_file(void *user_context, const char *filename, int32_t type_code, struct halide_buffer_t *buf); /** Types in the halide type system. They can be ints, unsigned ints, * or floats (of various bit-widths), or a handle (which is always 64-bits). * Note that the int/uint/float values do not imply a specific bit width * (the bit width is expected to be encoded in a separate value). 
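 *
 * For example, using the halide_type_t struct defined further below (which
 * pairs a code with a separate bit width and lane count):
 *
 *     halide_type_t(halide_type_uint, 8);      // uint8_t
 *     halide_type_t(halide_type_float, 32);    // float
 *     halide_type_t(halide_type_int, 32, 8);   // a vector of 8 x int32
 *     halide_type_t(halide_type_handle, 64);   // an opaque pointer (void *)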
*/ typedef enum halide_type_code_t #if __cplusplus >= 201103L : uint8_t #endif { halide_type_int = 0, //!< signed integers halide_type_uint = 1, //!< unsigned integers halide_type_float = 2, //!< IEEE floating point numbers halide_type_handle = 3, //!< opaque pointer type (void *) halide_type_bfloat = 4, //!< floating point numbers in the bfloat format } halide_type_code_t; // Note that while __attribute__ can go before or after the declaration, // __declspec apparently is only allowed before. #ifndef HALIDE_ATTRIBUTE_ALIGN #ifdef _MSC_VER #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) #else #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) #endif #endif /** A runtime tag for a type in the halide type system. Can be ints, * unsigned ints, or floats of various bit-widths (the 'bits' * field). Can also be vectors of the same (by setting the 'lanes' * field to something larger than one). This struct should be * exactly 32-bits in size. */ struct halide_type_t { /** The basic type code: signed integer, unsigned integer, or floating point. */ #if __cplusplus >= 201103L HALIDE_ATTRIBUTE_ALIGN(1) halide_type_code_t code; // halide_type_code_t #else HALIDE_ATTRIBUTE_ALIGN(1) uint8_t code; // halide_type_code_t #endif /** The number of bits of precision of a single scalar value of this type. */ HALIDE_ATTRIBUTE_ALIGN(1) uint8_t bits; /** How many elements in a vector. This is 1 for scalar types. */ HALIDE_ATTRIBUTE_ALIGN(2) uint16_t lanes; #ifdef __cplusplus /** Construct a runtime representation of a Halide type from: * code: The fundamental type from an enum. * bits: The bit size of one element. * lanes: The number of vector elements in the type. */ HALIDE_ALWAYS_INLINE halide_type_t(halide_type_code_t code, uint8_t bits, uint16_t lanes = 1) : code(code), bits(bits), lanes(lanes) { } /** Default constructor is required e.g. to declare halide_trace_event * instances. */ HALIDE_ALWAYS_INLINE halide_type_t() : code((halide_type_code_t)0), bits(0), lanes(0) { } HALIDE_ALWAYS_INLINE halide_type_t with_lanes(uint16_t new_lanes) const { return halide_type_t((halide_type_code_t)code, bits, new_lanes); } /** Compare two types for equality. */ HALIDE_ALWAYS_INLINE bool operator==(const halide_type_t &other) const { return as_u32() == other.as_u32(); } HALIDE_ALWAYS_INLINE bool operator!=(const halide_type_t &other) const { return !(*this == other); } HALIDE_ALWAYS_INLINE bool operator<(const halide_type_t &other) const { return as_u32() < other.as_u32(); } /** Size in bytes for a single element, even if width is not 1, of this type. */ HALIDE_ALWAYS_INLINE int bytes() const { return (bits + 7) / 8; } HALIDE_ALWAYS_INLINE uint32_t as_u32() const { uint32_t u; memcpy(&u, this, sizeof(u)); return u; } #endif }; enum halide_trace_event_code_t { halide_trace_load = 0, halide_trace_store = 1, halide_trace_begin_realization = 2, halide_trace_end_realization = 3, halide_trace_produce = 4, halide_trace_end_produce = 5, halide_trace_consume = 6, halide_trace_end_consume = 7, halide_trace_begin_pipeline = 8, halide_trace_end_pipeline = 9, halide_trace_tag = 10 }; struct halide_trace_event_t { /** The name of the Func or Pipeline that this event refers to */ const char *func; /** If the event type is a load or a store, this points to the * value being loaded or stored. Use the type field to safely cast * this to a concrete pointer type and retrieve it. For other * events this is null. */ void *value; /** For loads and stores, an array which contains the location * being accessed. 
For vector loads or stores it is an array of * vectors of coordinates (the vector dimension is innermost). * * For realization or production-related events, this will contain * the mins and extents of the region being accessed, in the order * min0, extent0, min1, extent1, ... * * For pipeline-related events, this will be null. */ int32_t *coordinates; /** For halide_trace_tag, this points to a read-only null-terminated string * of arbitrary text. For all other events, this will be null. */ const char *trace_tag; /** If the event type is a load or a store, this is the type of * the data. Otherwise, the value is meaningless. */ struct halide_type_t type; /** The type of event */ enum halide_trace_event_code_t event; /* The ID of the parent event (see below for an explanation of * event ancestry). */ int32_t parent_id; /** If this was a load or store of a Tuple-valued Func, this is * which tuple element was accessed. */ int32_t value_index; /** The length of the coordinates array */ int32_t dimensions; #ifdef __cplusplus // If we don't explicitly mark the default ctor as inline, // certain build configurations can fail (notably iOS) HALIDE_ALWAYS_INLINE halide_trace_event_t() { } #endif }; /** Called when Funcs are marked as trace_load, trace_store, or * trace_realization. See Func::set_custom_trace. The default * implementation either prints events via halide_print, or if * HL_TRACE_FILE is defined, dumps the trace to that file in a * sequence of trace packets. The header for a trace packet is defined * below. If the trace is going to be large, you may want to make the * file a named pipe, and then read from that pipe into gzip. * * halide_trace returns a unique ID which will be passed to future * events that "belong" to the earlier event as the parent id. The * ownership hierarchy looks like: * * begin_pipeline * +--trace_tag (if any) * +--trace_tag (if any) * ... * +--begin_realization * | +--produce * | | +--load/store * | | +--end_produce * | +--consume * | | +--load * | | +--end_consume * | +--end_realization * +--end_pipeline * * Threading means that ownership cannot be inferred from the ordering * of events. There can be many active realizations of a given * function, or many active productions for a single * realization. Within a single production, the ordering of events is * meaningful. * * Note that all trace_tag events (if any) will occur just after the begin_pipeline * event, but before any begin_realization events. All trace_tags for a given Func * will be emitted in the order added. */ // @} extern int32_t halide_trace(void *user_context, const struct halide_trace_event_t *event); extern int32_t halide_default_trace(void *user_context, const struct halide_trace_event_t *event); typedef int32_t (*halide_trace_t)(void *user_context, const struct halide_trace_event_t *); extern halide_trace_t halide_set_custom_trace(halide_trace_t trace); // @} /** The header of a packet in a binary trace. All fields are 32-bit. */ struct halide_trace_packet_t { /** The total size of this packet in bytes. Always a multiple of * four. Equivalently, the number of bytes until the next * packet. */ uint32_t size; /** The id of this packet (for the purpose of parent_id). 
*/ int32_t id; /** The remaining fields are equivalent to those in halide_trace_event_t */ // @{ struct halide_type_t type; enum halide_trace_event_code_t event; int32_t parent_id; int32_t value_index; int32_t dimensions; // @} #ifdef __cplusplus // If we don't explicitly mark the default ctor as inline, // certain build configurations can fail (notably iOS) HALIDE_ALWAYS_INLINE halide_trace_packet_t() { } /** Get the coordinates array, assuming this packet is laid out in * memory as it was written. The coordinates array comes * immediately after the packet header. */ HALIDE_ALWAYS_INLINE const int *coordinates() const { return (const int *)(this + 1); } HALIDE_ALWAYS_INLINE int *coordinates() { return (int *)(this + 1); } /** Get the value, assuming this packet is laid out in memory as * it was written. The packet comes immediately after the coordinates * array. */ HALIDE_ALWAYS_INLINE const void *value() const { return (const void *)(coordinates() + dimensions); } HALIDE_ALWAYS_INLINE void *value() { return (void *)(coordinates() + dimensions); } /** Get the func name, assuming this packet is laid out in memory * as it was written. It comes after the value. */ HALIDE_ALWAYS_INLINE const char *func() const { return (const char *)value() + type.lanes * type.bytes(); } HALIDE_ALWAYS_INLINE char *func() { return (char *)value() + type.lanes * type.bytes(); } /** Get the trace_tag (if any), assuming this packet is laid out in memory * as it was written. It comes after the func name. If there is no trace_tag, * this will return a pointer to an empty string. */ HALIDE_ALWAYS_INLINE const char *trace_tag() const { const char *f = func(); // strlen may not be available here while (*f++) { // nothing } return f; } HALIDE_ALWAYS_INLINE char *trace_tag() { char *f = func(); // strlen may not be available here while (*f++) { // nothing } return f; } #endif }; /** Set the file descriptor that Halide should write binary trace * events to. If called with 0 as the argument, Halide outputs trace * information to stdout in a human-readable format. If never called, * Halide checks the for existence of an environment variable called * HL_TRACE_FILE and opens that file. If HL_TRACE_FILE is not defined, * it outputs trace information to stdout in a human-readable * format. */ extern void halide_set_trace_file(int fd); /** Halide calls this to retrieve the file descriptor to write binary * trace events to. The default implementation returns the value set * by halide_set_trace_file. Implement it yourself if you wish to use * a custom file descriptor per user_context. Return zero from your * implementation to tell Halide to print human-readable trace * information to stdout. */ extern int halide_get_trace_file(void *user_context); /** If tracing is writing to a file. This call closes that file * (flushing the trace). Returns zero on success. */ extern int halide_shutdown_trace(); /** All Halide GPU or device backend implementations provide an * interface to be used with halide_device_malloc, etc. This is * accessed via the functions below. */ /** An opaque struct containing per-GPU API implementations of the * device functions. */ struct halide_device_interface_impl_t; /** Each GPU API provides a halide_device_interface_t struct pointing * to the code that manages device allocations. You can access these * functions directly from the struct member function pointers, or by * calling the functions declared below. Note that the global * functions are not available when using Halide as a JIT compiler. 
* If you are using raw halide_buffer_t in that context you must use * the function pointers in the device_interface struct. * * The function pointers below are currently the same for every GPU * API; only the impl field varies. These top-level functions do the * bookkeeping that is common across all GPU APIs, and then dispatch * to more API-specific functions via another set of function pointers * hidden inside the impl field. */ struct halide_device_interface_t { int (*device_malloc)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface); int (*device_free)(void *user_context, struct halide_buffer_t *buf); int (*device_sync)(void *user_context, struct halide_buffer_t *buf); void (*device_release)(void *user_context, const struct halide_device_interface_t *device_interface); int (*copy_to_host)(void *user_context, struct halide_buffer_t *buf); int (*copy_to_device)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface); int (*device_and_host_malloc)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface); int (*device_and_host_free)(void *user_context, struct halide_buffer_t *buf); int (*buffer_copy)(void *user_context, struct halide_buffer_t *src, const struct halide_device_interface_t *dst_device_interface, struct halide_buffer_t *dst); int (*device_crop)(void *user_context, const struct halide_buffer_t *src, struct halide_buffer_t *dst); int (*device_slice)(void *user_context, const struct halide_buffer_t *src, int slice_dim, int slice_pos, struct halide_buffer_t *dst); int (*device_release_crop)(void *user_context, struct halide_buffer_t *buf); int (*wrap_native)(void *user_context, struct halide_buffer_t *buf, uint64_t handle, const struct halide_device_interface_t *device_interface); int (*detach_native)(void *user_context, struct halide_buffer_t *buf); int (*compute_capability)(void *user_context, int *major, int *minor); const struct halide_device_interface_impl_t *impl; }; /** Release all data associated with the given device interface, in * particular all resources (memory, texture, context handles) * allocated by Halide. Must be called explicitly when using AOT * compilation. This is *not* thread-safe with respect to actively * running Halide code. Ensure all pipelines are finished before * calling this. */ extern void halide_device_release(void *user_context, const struct halide_device_interface_t *device_interface); /** Copy image data from device memory to host memory. This must be called * explicitly to copy back the results of a GPU-based filter. */ extern int halide_copy_to_host(void *user_context, struct halide_buffer_t *buf); /** Copy image data from host memory to device memory. This should not * be called directly; Halide handles copying to the device * automatically. If interface is NULL and the buf has a non-zero dev * field, the device associated with the dev handle will be * used. Otherwise if the dev field is 0 and interface is NULL, an * error is returned. */ extern int halide_copy_to_device(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface); /** Copy data from one buffer to another. The buffers may have * different shapes and sizes, but the destination buffer's shape must * be contained within the source buffer's shape. 
That is, for each * dimension, the min on the destination buffer must be greater than * or equal to the min on the source buffer, and min+extent on the * destination buffer must be less that or equal to min+extent on the * source buffer. The source data is pulled from either device or * host memory on the source, depending on the dirty flags. host is * preferred if both are valid. The dst_device_interface parameter * controls the destination memory space. NULL means host memory. */ extern int halide_buffer_copy(void *user_context, struct halide_buffer_t *src, const struct halide_device_interface_t *dst_device_interface, struct halide_buffer_t *dst); /** Give the destination buffer a device allocation which is an alias * for the same coordinate range in the source buffer. Modifies the * device, device_interface, and the device_dirty flag only. Only * supported by some device APIs (others will return * halide_error_code_device_crop_unsupported). Call * halide_device_release_crop instead of halide_device_free to clean * up resources associated with the cropped view. Do not free the * device allocation on the source buffer while the destination buffer * still lives. Note that the two buffers do not share dirty flags, so * care must be taken to update them together as needed. Note that src * and dst are required to have the same number of dimensions. * * Note also that (in theory) device interfaces which support cropping may * still not support cropping a crop (instead, create a new crop of the parent * buffer); in practice, no known implementation has this limitation, although * it is possible that some future implementations may require it. */ extern int halide_device_crop(void *user_context, const struct halide_buffer_t *src, struct halide_buffer_t *dst); /** Give the destination buffer a device allocation which is an alias * for a similar coordinate range in the source buffer, but with one dimension * sliced away in the dst. Modifies the device, device_interface, and the * device_dirty flag only. Only supported by some device APIs (others will return * halide_error_code_device_crop_unsupported). Call * halide_device_release_crop instead of halide_device_free to clean * up resources associated with the sliced view. Do not free the * device allocation on the source buffer while the destination buffer * still lives. Note that the two buffers do not share dirty flags, so * care must be taken to update them together as needed. Note that the dst buffer * must have exactly one fewer dimension than the src buffer, and that slice_dim * and slice_pos must be valid within src. */ extern int halide_device_slice(void *user_context, const struct halide_buffer_t *src, int slice_dim, int slice_pos, struct halide_buffer_t *dst); /** Release any resources associated with a cropped/sliced view of another * buffer. */ extern int halide_device_release_crop(void *user_context, struct halide_buffer_t *buf); /** Wait for current GPU operations to complete. Calling this explicitly * should rarely be necessary, except maybe for profiling. */ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf); /** Allocate device memory to back a halide_buffer_t. */ extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface); /** Free device memory. 
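 *
 * A hedged sketch of the explicit lifecycle when managing device storage for
 * a raw halide_buffer_t by hand (buf is assumed to be a fully initialized
 * buffer and iface a device interface obtained from one of the
 * backend-specific runtime headers, e.g. HalideRuntimeCuda.h):
 *
 *     int err = halide_device_malloc(NULL, &buf, iface);
 *     // ... run a pipeline that writes buf on the device ...
 *     err = halide_copy_to_host(NULL, &buf);   // pull results back to host
 *     err = halide_device_free(NULL, &buf);    // release the device storage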
*/ extern int halide_device_free(void *user_context, struct halide_buffer_t *buf); /** Wrap or detach a native device handle, setting the device field * and device_interface field as appropriate for the given GPU * API. The meaning of the opaque handle is specific to the device * interface, so if you know the device interface in use, call the * more specific functions in the runtime headers for your specific * device API instead (e.g. HalideRuntimeCuda.h). */ // @{ extern int halide_device_wrap_native(void *user_context, struct halide_buffer_t *buf, uint64_t handle, const struct halide_device_interface_t *device_interface); extern int halide_device_detach_native(void *user_context, struct halide_buffer_t *buf); // @} /** Selects which gpu device to use. 0 is usually the display * device. If never called, Halide uses the environment variable * HL_GPU_DEVICE. If that variable is unset, Halide uses the last * device. Set this to -1 to use the last device. */ extern void halide_set_gpu_device(int n); /** Halide calls this to get the desired halide gpu device * setting. Implement this yourself to use a different gpu device per * user_context. The default implementation returns the value set by * halide_set_gpu_device, or the environment variable * HL_GPU_DEVICE. */ extern int halide_get_gpu_device(void *user_context); /** Set the soft maximum amount of memory, in bytes, that the LRU * cache will use to memoize Func results. This is not a strict * maximum in that concurrency and simultaneous use of memoized * results larger than the cache size can both cause it to * temporarily be larger than the size specified here. */ extern void halide_memoization_cache_set_size(int64_t size); /** Given a cache key for a memoized result, currently constructed * from the Func name and top-level Func name plus the arguments of * the computation, determine if the result is in the cache and * return it if so. (The internals of the cache key should be * considered opaque by this function.) A return value of 1 signals * a cache miss. On a cache hit (return value 0), the * buffers passed in will be filled, via copying, with memoized * data. The last argument is a list of halide_buffer_t pointers which * represents the outputs of the memoized Func. If the Func does not * return a Tuple, there will only be one halide_buffer_t in the list. The * tuple_count parameter determines the length of the list. * * The return values are: * -1: Signals an error. * 0: Success and cache hit. * 1: Success and cache miss. */ extern int halide_memoization_cache_lookup(void *user_context, const uint8_t *cache_key, int32_t size, struct halide_buffer_t *realized_bounds, int32_t tuple_count, struct halide_buffer_t **tuple_buffers); /** Given a cache key for a memoized result, currently constructed * from the Func name and top-level Func name plus the arguments of * the computation, store the result in the cache for future access by * halide_memoization_cache_lookup. (The internals of the cache key * should be considered opaque by this function.) Data is copied out * from the inputs and inputs are unmodified. The last argument is a * list of halide_buffer_t pointers which represents the outputs of the * memoized Func. If the Func does not return a Tuple, there will * only be one halide_buffer_t in the list. The tuple_count parameter * determines the length of the list. * * If there is a memory allocation failure, the store does not store * the data into the cache.
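 *
 * For orientation only, the lookup/store/release protocol is roughly the
 * following (a sketch, not generated code; ctx, key, key_size, bounds, and
 * bufs are placeholders, and a single-output Func is assumed):
 *
 *     int r = halide_memoization_cache_lookup(ctx, key, key_size, bounds, 1, bufs);
 *     if (r == 1) {
 *         // Cache miss: compute the result into bufs, then publish it.
 *         halide_memoization_cache_store(ctx, key, key_size, bounds, 1, bufs);
 *     } else if (r == 0) {
 *         // Cache hit: bufs already holds the memoized data.
 *     }
 *     // When the data is no longer needed, release the cache's reference.
 *     halide_memoization_cache_release(ctx, bufs[0]->host);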
*/ extern int halide_memoization_cache_store(void *user_context, const uint8_t *cache_key, int32_t size, struct halide_buffer_t *realized_bounds, int32_t tuple_count, struct halide_buffer_t **tuple_buffers); /** If halide_memoization_cache_lookup succeeds, * halide_memoization_cache_release must be called to signal the * storage is no longer being used by the caller. It will be passed * the host pointer of one of the buffers returned by * halide_memoization_cache_lookup. That is, * halide_memoization_cache_release will be called multiple times for * the case where halide_memoization_cache_lookup is handling multiple * buffers. (This corresponds to memoizing a Tuple in Halide.) Note * that the host pointer must be sufficient to get to all information * the release operation needs. The default Halide cache implementation * accomplishes this by storing extra data before the start of the user- * modifiable host storage. * * This call is like free and does not have a failure return. */ extern void halide_memoization_cache_release(void *user_context, void *host); /** Free all memory and resources associated with the memoization cache. * Must be called at a time when no other threads are accessing the cache. */ extern void halide_memoization_cache_cleanup(); /** Verify that a given range of memory has been initialized; only used when Target::MSAN is enabled. * * The default implementation simply calls the LLVM-provided __msan_check_mem_is_initialized() function. * * The return value should always be zero. */ extern int halide_msan_check_memory_is_initialized(void *user_context, const void *ptr, uint64_t len, const char *name); /** Verify that the data pointed to by the halide_buffer_t is initialized (but *not* the halide_buffer_t itself), * using halide_msan_check_memory_is_initialized() for checking. * * The default implementation takes pains to only check the active memory ranges * (skipping padding), and sorting into ranges to always check the smallest number of * ranges, in monotonically increasing memory order. * * Most client code should never need to replace the default implementation. * * The return value should always be zero. */ extern int halide_msan_check_buffer_is_initialized(void *user_context, struct halide_buffer_t *buffer, const char *buf_name); /** Annotate that a given range of memory has been initialized; * only used when Target::MSAN is enabled. * * The default implementation simply calls the LLVM-provided __msan_unpoison() function. * * The return value should always be zero. */ extern int halide_msan_annotate_memory_is_initialized(void *user_context, const void *ptr, uint64_t len); /** Mark the data pointed to by the halide_buffer_t as initialized (but *not* the halide_buffer_t itself), * using halide_msan_annotate_memory_is_initialized() for marking. * * The default implementation takes pains to only mark the active memory ranges * (skipping padding), and sorting into ranges to always mark the smallest number of * ranges, in monotonically increasing memory order. * * Most client code should never need to replace the default implementation. * * The return value should always be zero. */ extern int halide_msan_annotate_buffer_is_initialized(void *user_context, struct halide_buffer_t *buffer); extern void halide_msan_annotate_buffer_is_initialized_as_destructor(void *user_context, void *buffer); /** The error codes that may be returned by a Halide pipeline. */ enum halide_error_code_t { /** There was no error. This is the value returned by Halide on success.
*/ halide_error_code_success = 0, /** An uncategorized error occurred. Refer to the string passed to halide_error. */ halide_error_code_generic_error = -1, /** A Func was given an explicit bound via Func::bound, but this * was not large enough to encompass the region that is used of * the Func by the rest of the pipeline. */ halide_error_code_explicit_bounds_too_small = -2, /** The elem_size field of a halide_buffer_t does not match the size in * bytes of the type of that ImageParam. Probable type mismatch. */ halide_error_code_bad_type = -3, /** A pipeline would access memory outside of the halide_buffer_t passed * in. */ halide_error_code_access_out_of_bounds = -4, /** A halide_buffer_t was given that spans more than 2GB of memory. */ halide_error_code_buffer_allocation_too_large = -5, /** A halide_buffer_t was given with extents that multiply to a number * greater than 2^31-1 */ halide_error_code_buffer_extents_too_large = -6, /** Applying explicit constraints on the size of an input or * output buffer shrank the size of that buffer below what will be * accessed by the pipeline. */ halide_error_code_constraints_make_required_region_smaller = -7, /** A constraint on a size or stride of an input or output buffer * was not met by the halide_buffer_t passed in. */ halide_error_code_constraint_violated = -8, /** A scalar parameter passed in was smaller than its minimum * declared value. */ halide_error_code_param_too_small = -9, /** A scalar parameter passed in was greater than its maximum * declared value. */ halide_error_code_param_too_large = -10, /** A call to halide_malloc returned NULL. */ halide_error_code_out_of_memory = -11, /** A halide_buffer_t pointer passed in was NULL. */ halide_error_code_buffer_argument_is_null = -12, /** debug_to_file failed to open or write to the specified * file. */ halide_error_code_debug_to_file_failed = -13, /** The Halide runtime encountered an error while trying to copy * from device to host. Turn on -debug in your target string to * see more details. */ halide_error_code_copy_to_host_failed = -14, /** The Halide runtime encountered an error while trying to copy * from host to device. Turn on -debug in your target string to * see more details. */ halide_error_code_copy_to_device_failed = -15, /** The Halide runtime encountered an error while trying to * allocate memory on device. Turn on -debug in your target string * to see more details. */ halide_error_code_device_malloc_failed = -16, /** The Halide runtime encountered an error while trying to * synchronize with a device. Turn on -debug in your target string * to see more details. */ halide_error_code_device_sync_failed = -17, /** The Halide runtime encountered an error while trying to free a * device allocation. Turn on -debug in your target string to see * more details. */ halide_error_code_device_free_failed = -18, /** Buffer has a non-zero device but no device interface, which * violates a Halide invariant. */ halide_error_code_no_device_interface = -19, /** An error occurred when attempting to initialize the Matlab * runtime. */ halide_error_code_matlab_init_failed = -20, /** The type of an mxArray did not match the expected type. */ halide_error_code_matlab_bad_param_type = -21, /** There is a bug in the Halide compiler. */ halide_error_code_internal_error = -22, /** The Halide runtime encountered an error while trying to launch * a GPU kernel. Turn on -debug in your target string to see more * details.
*/ halide_error_code_device_run_failed = -23, /** The Halide runtime encountered a host pointer that violated * the alignment set for it by way of a call to * set_host_alignment */ halide_error_code_unaligned_host_ptr = -24, /** A fold_storage directive was used on a dimension that is not * accessed in a monotonically increasing or decreasing fashion. */ halide_error_code_bad_fold = -25, /** A fold_storage directive was used with a fold factor that was * too small to store all the values of a producer needed by the * consumer. */ halide_error_code_fold_factor_too_small = -26, /** User-specified require() expression was not satisfied. */ halide_error_code_requirement_failed = -27, /** At least one of the buffer's extents are negative. */ halide_error_code_buffer_extents_negative = -28, halide_error_code_unused_29 = -29, halide_error_code_unused_30 = -30, /** A specialize_fail() schedule branch was selected at runtime. */ halide_error_code_specialize_fail = -31, /** The Halide runtime encountered an error while trying to wrap a * native device handle. Turn on -debug in your target string to * see more details. */ halide_error_code_device_wrap_native_failed = -32, /** The Halide runtime encountered an error while trying to detach * a native device handle. Turn on -debug in your target string * to see more details. */ halide_error_code_device_detach_native_failed = -33, /** The host field on an input or output was null, the device * field was not zero, and the pipeline tries to use the buffer on * the host. You may be passing a GPU-only buffer to a pipeline * which is scheduled to use it on the CPU. */ halide_error_code_host_is_null = -34, /** A folded buffer was passed to an extern stage, but the region * touched wraps around the fold boundary. */ halide_error_code_bad_extern_fold = -35, /** Buffer has a non-null device_interface but device is 0, which * violates a Halide invariant. */ halide_error_code_device_interface_no_device = -36, /** Buffer has both host and device dirty bits set, which violates * a Halide invariant. */ halide_error_code_host_and_device_dirty = -37, /** The halide_buffer_t * passed to a halide runtime routine is * nullptr and this is not allowed. */ halide_error_code_buffer_is_null = -38, /** The Halide runtime encountered an error while trying to copy * from one buffer to another. Turn on -debug in your target * string to see more details. */ halide_error_code_device_buffer_copy_failed = -39, /** Attempted to make cropped/sliced alias of a buffer with a device * field, but the device_interface does not support cropping. */ halide_error_code_device_crop_unsupported = -40, /** Cropping/slicing a buffer failed for some other reason. Turn on -debug * in your target string. */ halide_error_code_device_crop_failed = -41, /** An operation on a buffer required an allocation on a * particular device interface, but a device allocation already * existed on a different device interface. Free the old one * first. */ halide_error_code_incompatible_device_interface = -42, /** The dimensions field of a halide_buffer_t does not match the dimensions of that ImageParam. */ halide_error_code_bad_dimensions = -43, /** An expression that would perform an integer division or modulo * by zero was evaluated. */ halide_error_code_device_dirty_with_no_device_support = -44, }; /** Halide calls the functions below on various error conditions. The * default implementations construct an error message, call * halide_error, then return the matching error code above. 
On * platforms that support weak linking, you can override these to * catch the errors individually. */ /** A call into an extern stage for the purposes of bounds inference * failed. Returns the error code given by the extern stage. */ extern int halide_error_bounds_inference_call_failed(void *user_context, const char *extern_stage_name, int result); /** A call to an extern stage failed. Returned the error code given by * the extern stage. */ extern int halide_error_extern_stage_failed(void *user_context, const char *extern_stage_name, int result); /** Various other error conditions. See the enum above for a * description of each. */ // @{ extern int halide_error_explicit_bounds_too_small(void *user_context, const char *func_name, const char *var_name, int min_bound, int max_bound, int min_required, int max_required); extern int halide_error_bad_type(void *user_context, const char *func_name, uint32_t type_given, uint32_t correct_type); // N.B. The last two args are the bit representation of a halide_type_t extern int halide_error_bad_dimensions(void *user_context, const char *func_name, int32_t dimensions_given, int32_t correct_dimensions); extern int halide_error_access_out_of_bounds(void *user_context, const char *func_name, int dimension, int min_touched, int max_touched, int min_valid, int max_valid); extern int halide_error_buffer_allocation_too_large(void *user_context, const char *buffer_name, uint64_t allocation_size, uint64_t max_size); extern int halide_error_buffer_extents_negative(void *user_context, const char *buffer_name, int dimension, int extent); extern int halide_error_buffer_extents_too_large(void *user_context, const char *buffer_name, int64_t actual_size, int64_t max_size); extern int halide_error_constraints_make_required_region_smaller(void *user_context, const char *buffer_name, int dimension, int constrained_min, int constrained_extent, int required_min, int required_extent); extern int halide_error_constraint_violated(void *user_context, const char *var, int val, const char *constrained_var, int constrained_val); extern int halide_error_param_too_small_i64(void *user_context, const char *param_name, int64_t val, int64_t min_val); extern int halide_error_param_too_small_u64(void *user_context, const char *param_name, uint64_t val, uint64_t min_val); extern int halide_error_param_too_small_f64(void *user_context, const char *param_name, double val, double min_val); extern int halide_error_param_too_large_i64(void *user_context, const char *param_name, int64_t val, int64_t max_val); extern int halide_error_param_too_large_u64(void *user_context, const char *param_name, uint64_t val, uint64_t max_val); extern int halide_error_param_too_large_f64(void *user_context, const char *param_name, double val, double max_val); extern int halide_error_out_of_memory(void *user_context); extern int halide_error_buffer_argument_is_null(void *user_context, const char *buffer_name); extern int halide_error_debug_to_file_failed(void *user_context, const char *func, const char *filename, int error_code); extern int halide_error_unaligned_host_ptr(void *user_context, const char *func_name, int alignment); extern int halide_error_host_is_null(void *user_context, const char *func_name); extern int halide_error_bad_fold(void *user_context, const char *func_name, const char *var_name, const char *loop_name); extern int halide_error_bad_extern_fold(void *user_context, const char *func_name, int dim, int min, int extent, int valid_min, int fold_factor); extern int 
halide_error_fold_factor_too_small(void *user_context, const char *func_name, const char *var_name, int fold_factor, const char *loop_name, int required_extent); extern int halide_error_requirement_failed(void *user_context, const char *condition, const char *message); extern int halide_error_specialize_fail(void *user_context, const char *message); extern int halide_error_no_device_interface(void *user_context); extern int halide_error_device_interface_no_device(void *user_context); extern int halide_error_host_and_device_dirty(void *user_context); extern int halide_error_buffer_is_null(void *user_context, const char *routine); extern int halide_error_device_dirty_with_no_device_support(void *user_context, const char *buffer_name); // @} /** Optional features a compilation Target can have. * Be sure to keep this in sync with the Feature enum in Target.h and the implementation of * get_runtime_compatible_target in Target.cpp if you add a new feature. */ typedef enum halide_target_feature_t { halide_target_feature_jit = 0, ///< Generate code that will run immediately inside the calling process. halide_target_feature_debug, ///< Turn on debug info and output for runtime code. halide_target_feature_no_asserts, ///< Disable all runtime checks, for slightly tighter code. halide_target_feature_no_bounds_query, ///< Disable the bounds querying functionality. halide_target_feature_sse41, ///< Use SSE 4.1 and earlier instructions. Only relevant on x86. halide_target_feature_avx, ///< Use AVX 1 instructions. Only relevant on x86. halide_target_feature_avx2, ///< Use AVX 2 instructions. Only relevant on x86. halide_target_feature_fma, ///< Enable x86 FMA instruction halide_target_feature_fma4, ///< Enable x86 (AMD) FMA4 instruction set halide_target_feature_f16c, ///< Enable x86 16-bit float support halide_target_feature_armv7s, ///< Generate code for ARMv7s. Only relevant for 32-bit ARM. halide_target_feature_no_neon, ///< Avoid using NEON instructions. Only relevant for 32-bit ARM. halide_target_feature_vsx, ///< Use VSX instructions. Only relevant on POWERPC. halide_target_feature_power_arch_2_07, ///< Use POWER ISA 2.07 new instructions. Only relevant on POWERPC. halide_target_feature_cuda, ///< Enable the CUDA runtime. Defaults to compute capability 2.0 (Fermi) halide_target_feature_cuda_capability30, ///< Enable CUDA compute capability 3.0 (Kepler) halide_target_feature_cuda_capability32, ///< Enable CUDA compute capability 3.2 (Tegra K1) halide_target_feature_cuda_capability35, ///< Enable CUDA compute capability 3.5 (Kepler) halide_target_feature_cuda_capability50, ///< Enable CUDA compute capability 5.0 (Maxwell) halide_target_feature_opencl, ///< Enable the OpenCL runtime. halide_target_feature_cl_doubles, ///< Enable double support on OpenCL targets halide_target_feature_cl_atomic64, ///< Enable 64-bit atomics operations on OpenCL targets halide_target_feature_opengl, ///< Enable the OpenGL runtime. halide_target_feature_openglcompute, ///< Enable OpenGL Compute runtime. halide_target_feature_user_context, ///< Generated code takes a user_context pointer as first argument halide_target_feature_matlab, ///< Generate a mexFunction compatible with Matlab mex libraries. See tools/mex_halide.m. 
halide_target_feature_profile, ///< Launch a sampling profiler alongside the Halide pipeline that monitors and reports the runtime used by each Func halide_target_feature_no_runtime, ///< Do not include a copy of the Halide runtime in any generated object file or assembly halide_target_feature_metal, ///< Enable the (Apple) Metal runtime. halide_target_feature_c_plus_plus_mangling, ///< Generate C++ mangled names for result function, et al halide_target_feature_large_buffers, ///< Enable 64-bit buffer indexing to support buffers > 2GB. Ignored if bits != 64. halide_target_feature_hvx_64, ///< Enable HVX 64 byte mode. halide_target_feature_hvx_128, ///< Enable HVX 128 byte mode. halide_target_feature_hvx_v62, ///< Enable Hexagon v62 architecture. halide_target_feature_fuzz_float_stores, ///< On every floating point store, set the last bit of the mantissa to zero. Pipelines for which the output is very different with this feature enabled may also produce very different output on different processors. halide_target_feature_soft_float_abi, ///< Enable soft float ABI. This only enables the soft float ABI calling convention, which does not necessarily use soft floats. halide_target_feature_msan, ///< Enable hooks for MSAN support. halide_target_feature_avx512, ///< Enable the base AVX512 subset supported by all AVX512 architectures. The specific feature sets are AVX-512F and AVX512-CD. See https://en.wikipedia.org/wiki/AVX-512 for a description of each AVX subset. halide_target_feature_avx512_knl, ///< Enable the AVX512 features supported by Knight's Landing chips, such as the Xeon Phi x200. This includes the base AVX512 set, and also AVX512-CD and AVX512-ER. halide_target_feature_avx512_skylake, ///< Enable the AVX512 features supported by Skylake Xeon server processors. This adds AVX512-VL, AVX512-BW, and AVX512-DQ to the base set. The main difference from the base AVX512 set is better support for small integer ops. Note that this does not include the Knight's Landing features. Note also that these features are not available on Skylake desktop and mobile processors. halide_target_feature_avx512_cannonlake, ///< Enable the AVX512 features expected to be supported by future Cannonlake processors. This includes all of the Skylake features, plus AVX512-IFMA and AVX512-VBMI. halide_target_feature_hvx_use_shared_object, ///< Deprecated halide_target_feature_trace_loads, ///< Trace all loads done by the pipeline. Equivalent to calling Func::trace_loads on every non-inlined Func. halide_target_feature_trace_stores, ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func. halide_target_feature_trace_realizations, ///< Trace all realizations done by the pipeline. Equivalent to calling Func::trace_realizations on every non-inlined Func. halide_target_feature_trace_pipeline, ///< Trace the pipeline. halide_target_feature_cuda_capability61, ///< Enable CUDA compute capability 6.1 (Pascal) halide_target_feature_hvx_v65, ///< Enable Hexagon v65 architecture. halide_target_feature_hvx_v66, ///< Enable Hexagon v66 architecture. halide_target_feature_cl_half, ///< Enable half support on OpenCL targets halide_target_feature_strict_float, ///< Turn off all non-IEEE floating-point optimization. Currently applies only to LLVM targets. halide_target_feature_tsan, ///< Enable hooks for TSAN support. halide_target_feature_asan, ///< Enable hooks for ASAN support. halide_target_feature_d3d12compute, ///< Enable Direct3D 12 Compute runtime. 
halide_target_feature_check_unsafe_promises, ///< Insert assertions for promises. halide_target_feature_hexagon_dma, ///< Enable Hexagon DMA buffers. halide_target_feature_embed_bitcode, ///< Emulate clang -fembed-bitcode flag. halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.) halide_target_feature_disable_llvm_loop_opt, ///< Disable loop vectorization + unrolling in LLVM. (Ignored for non-LLVM targets.) halide_target_feature_wasm_simd128, ///< Enable +simd128 instructions for WebAssembly codegen. halide_target_feature_wasm_signext, ///< Enable +sign-ext instructions for WebAssembly codegen. halide_target_feature_sve, ///< Enable ARM Scalable Vector Extensions halide_target_feature_sve2, ///< Enable ARM Scalable Vector Extensions v2 halide_target_feature_egl, ///< Force use of EGL support. halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. } halide_target_feature_t; /** This function is called internally by Halide in some situations to determine * if the current execution environment can support the given set of * halide_target_feature_t flags. The implementation must do the following: * * -- If there are flags set in features that the function knows *cannot* be supported, return 0. * -- Otherwise, return 1. * -- Note that any flags set in features that the function doesn't know how to test should be ignored; * this implies that a return value of 1 means "not known to be bad" rather than "known to be good". * * In other words: a return value of 0 means "It is not safe to use code compiled with these features", * while a return value of 1 means "It is not obviously unsafe to use code compiled with these features". * * The default implementation simply calls halide_default_can_use_target_features. * * Note that `features` points to an array of `count` uint64_t; this array must contain enough * bits to represent all the currently known features. Any excess bits must be set to zero. */ // @{ extern int halide_can_use_target_features(int count, const uint64_t *features); typedef int (*halide_can_use_target_features_t)(int count, const uint64_t *features); extern halide_can_use_target_features_t halide_set_custom_can_use_target_features(halide_can_use_target_features_t); // @} /** * This is the default implementation of halide_can_use_target_features; it is provided * for convenience of user code that may wish to extend halide_can_use_target_features * but continue providing existing support, e.g. * * int halide_can_use_target_features(int count, const uint64_t *features) { * if (features[halide_target_somefeature >> 6] & (1LL << (halide_target_somefeature & 63))) { * if (!can_use_somefeature()) { * return 0; * } * } * return halide_default_can_use_target_features(count, features); * } */ extern int halide_default_can_use_target_features(int count, const uint64_t *features); typedef struct halide_dimension_t { int32_t min, extent, stride; // Per-dimension flags. None are defined yet (This is reserved for future use). 
uint32_t flags; #ifdef __cplusplus HALIDE_ALWAYS_INLINE halide_dimension_t() : min(0), extent(0), stride(0), flags(0) { } HALIDE_ALWAYS_INLINE halide_dimension_t(int32_t m, int32_t e, int32_t s, uint32_t f = 0) : min(m), extent(e), stride(s), flags(f) { } HALIDE_ALWAYS_INLINE bool operator==(const halide_dimension_t &other) const { return (min == other.min) && (extent == other.extent) && (stride == other.stride) && (flags == other.flags); } HALIDE_ALWAYS_INLINE bool operator!=(const halide_dimension_t &other) const { return !(*this == other); } #endif } halide_dimension_t; #ifdef __cplusplus } // extern "C" #endif typedef enum { halide_buffer_flag_host_dirty = 1, halide_buffer_flag_device_dirty = 2 } halide_buffer_flags; /** * The raw representation of an image passed around by generated * Halide code. It includes some stuff to track whether the image is * not actually in main memory, but instead on a device (like a * GPU). For a more convenient C++ wrapper, use Halide::Buffer. */ typedef struct halide_buffer_t { /** A device-handle for e.g. GPU memory used to back this buffer. */ uint64_t device; /** The interface used to interpret the above handle. */ const struct halide_device_interface_t *device_interface; /** A pointer to the start of the data in main memory. In terms of * the Halide coordinate system, this is the address of the min * coordinates (defined below). */ uint8_t *host; /** flags with various meanings. */ uint64_t flags; /** The type of each buffer element. */ struct halide_type_t type; /** The dimensionality of the buffer. */ int32_t dimensions; /** The shape of the buffer. Halide does not own this array - you * must manage the memory for it yourself. */ halide_dimension_t *dim; /** Pads the buffer up to a multiple of 8 bytes */ void *padding; #ifdef __cplusplus /** Convenience methods for accessing the flags */ // @{ HALIDE_ALWAYS_INLINE bool get_flag(halide_buffer_flags flag) const { return (flags & flag) != 0; } HALIDE_ALWAYS_INLINE void set_flag(halide_buffer_flags flag, bool value) { if (value) { flags |= flag; } else { flags &= ~flag; } } HALIDE_ALWAYS_INLINE bool host_dirty() const { return get_flag(halide_buffer_flag_host_dirty); } HALIDE_ALWAYS_INLINE bool device_dirty() const { return get_flag(halide_buffer_flag_device_dirty); } HALIDE_ALWAYS_INLINE void set_host_dirty(bool v = true) { set_flag(halide_buffer_flag_host_dirty, v); } HALIDE_ALWAYS_INLINE void set_device_dirty(bool v = true) { set_flag(halide_buffer_flag_device_dirty, v); } // @} /** The total number of elements this buffer represents. Equal to * the product of the extents */ HALIDE_ALWAYS_INLINE size_t number_of_elements() const { size_t s = 1; for (int i = 0; i < dimensions; i++) { s *= dim[i].extent; } return s; } /** A pointer to the element with the lowest address. If all * strides are positive, equal to the host pointer. */ HALIDE_ALWAYS_INLINE uint8_t *begin() const { ptrdiff_t index = 0; for (int i = 0; i < dimensions; i++) { if (dim[i].stride < 0) { index += dim[i].stride * (dim[i].extent - 1); } } return host + index * type.bytes(); } /** A pointer to one beyond the element with the highest address. */ HALIDE_ALWAYS_INLINE uint8_t *end() const { ptrdiff_t index = 0; for (int i = 0; i < dimensions; i++) { if (dim[i].stride > 0) { index += dim[i].stride * (dim[i].extent - 1); } } index += 1; return host + index * type.bytes(); } /** The total number of bytes spanned by the data in memory. 
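 *
 * For illustration (a hypothetical dense layout, not something implied by
 * this struct): a 640x480 buffer of uint8 values with
 * dim = {{0, 640, 1}, {0, 480, 640}} has
 * size_in_bytes() == 640 * 480 * 1 == 307200.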
*/ HALIDE_ALWAYS_INLINE size_t size_in_bytes() const { return (size_t)(end() - begin()); } /** A pointer to the element at the given location. */ HALIDE_ALWAYS_INLINE uint8_t *address_of(const int *pos) const { ptrdiff_t index = 0; for (int i = 0; i < dimensions; i++) { index += dim[i].stride * (pos[i] - dim[i].min); } return host + index * type.bytes(); } /** Attempt to call device_sync for the buffer. If the buffer * has no device_interface (or no device_sync), this is a quiet no-op. * Calling this explicitly should rarely be necessary, except for profiling. */ HALIDE_ALWAYS_INLINE int device_sync(void *ctx = NULL) { if (device_interface && device_interface->device_sync) { return device_interface->device_sync(ctx, this); } return 0; } /** Check if an input buffer passed extern stage is a querying * bounds. Compared to doing the host pointer check directly, * this both adds clarity to code and will facilitate moving to * another representation for bounds query arguments. */ HALIDE_ALWAYS_INLINE bool is_bounds_query() const { return host == NULL && device == 0; } #endif } halide_buffer_t; #ifdef __cplusplus extern "C" { #endif #ifndef HALIDE_ATTRIBUTE_DEPRECATED #ifdef HALIDE_ALLOW_DEPRECATED #define HALIDE_ATTRIBUTE_DEPRECATED(x) #else #ifdef _MSC_VER #define HALIDE_ATTRIBUTE_DEPRECATED(x) __declspec(deprecated(x)) #else #define HALIDE_ATTRIBUTE_DEPRECATED(x) __attribute__((deprecated(x))) #endif #endif #endif /** halide_scalar_value_t is a simple union able to represent all the well-known * scalar values in a filter argument. Note that it isn't tagged with a type; * you must ensure you know the proper type before accessing. Most user * code will never need to create instances of this struct; its primary use * is to hold def/min/max values in a halide_filter_argument_t. (Note that * this is conceptually just a union; it's wrapped in a struct to ensure * that it doesn't get anonymized by LLVM.) */ struct halide_scalar_value_t { union { bool b; int8_t i8; int16_t i16; int32_t i32; int64_t i64; uint8_t u8; uint16_t u16; uint32_t u32; uint64_t u64; float f32; double f64; void *handle; } u; #ifdef __cplusplus HALIDE_ALWAYS_INLINE halide_scalar_value_t() { u.u64 = 0; } #endif }; enum halide_argument_kind_t { halide_argument_kind_input_scalar = 0, halide_argument_kind_input_buffer = 1, halide_argument_kind_output_buffer = 2 }; /* These structs must be robust across different compilers and settings; when modifying them, strive for the following rules: 1) All fields are explicitly sized. I.e. must use int32_t and not "int" 2) All fields must land on an alignment boundary that is the same as their size 3) Explicit padding is added to make that so 4) The sizeof the struct is padded out to a multiple of the largest natural size thing in the struct 5) don't forget that 32 and 64 bit pointers are different sizes */ /** * Obsolete version of halide_filter_argument_t; only present in * code that wrote halide_filter_metadata_t version 0. */ struct halide_filter_argument_t_v0 { const char *name; int32_t kind; int32_t dimensions; struct halide_type_t type; const struct halide_scalar_value_t *def, *min, *max; }; /** * halide_filter_argument_t is essentially a plain-C-struct equivalent to * Halide::Argument; most user code will never need to create one. */ struct halide_filter_argument_t { const char *name; // name of the argument; will never be null or empty. 
int32_t kind; // actually halide_argument_kind_t int32_t dimensions; // always zero for scalar arguments struct halide_type_t type; // These pointers should always be null for buffer arguments, // and *may* be null for scalar arguments. (A null value means // there is no def/min/max/estimate specified for this argument.) const struct halide_scalar_value_t *scalar_def, *scalar_min, *scalar_max, *scalar_estimate; // This pointer should always be null for scalar arguments, // and *may* be null for buffer arguments. If not null, it should always // point to an array of dimensions*2 pointers, which will be the (min, extent) // estimates for each dimension of the buffer. (Note that any of the pointers // may be null as well.) int64_t const *const *buffer_estimates; }; struct halide_filter_metadata_t { #ifdef __cplusplus static const int32_t VERSION = 1; #endif /** version of this metadata; currently always 1. */ int32_t version; /** The number of entries in the arguments field. This is always >= 1. */ int32_t num_arguments; /** An array of the filter's input and output arguments; this will never be * null. The order of arguments is not guaranteed (input and output arguments * may come in any order); however, it is guaranteed that all arguments * will have a unique name within a given filter. */ const struct halide_filter_argument_t *arguments; /** The Target for which the filter was compiled. This is always * a canonical Target string (ie a product of Target::to_string). */ const char *target; /** The function name of the filter. */ const char *name; }; /** halide_register_argv_and_metadata() is a **user-defined** function that * must be provided in order to use the registration.cc files produced * by Generators when the 'registration' output is requested. Each registration.cc * file provides a static initializer that calls this function with the given * filter's argv-call variant, its metadata, and (optionally) additional * textual data that the build system chooses to tack on for its own purposes. * Note that this will be called at static-initializer time (i.e., before * main() is called), and in an unpredictable order. Note that extra_key_value_pairs * may be nullptr; if it's not null, it's expected to be a null-terminated list * of strings, with an even number of entries. */ void halide_register_argv_and_metadata( int (*filter_argv_call)(void **), const struct halide_filter_metadata_t *filter_metadata, const char *const *extra_key_value_pairs); /** The functions below here are relevant for pipelines compiled with * the -profile target flag, which runs a sampling profiler thread * alongside the pipeline. */ /** Per-Func state tracked by the sampling profiler. */ struct halide_profiler_func_stats { /** Total time taken evaluating this Func (in nanoseconds). */ uint64_t time; /** The current memory allocation of this Func. */ uint64_t memory_current; /** The peak memory allocation of this Func. */ uint64_t memory_peak; /** The total memory allocation of this Func. */ uint64_t memory_total; /** The peak stack allocation of this Func's threads. */ uint64_t stack_peak; /** The average number of thread pool worker threads active while computing this Func. */ uint64_t active_threads_numerator, active_threads_denominator; /** The name of this Func. A global constant string. */ const char *name; /** The total number of memory allocations of this Func. */ int num_allocs; }; /** Per-pipeline state tracked by the sampling profiler. These exist * in a linked list.
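 *
 * For reference, a hedged sketch of walking this list for inspection
 * (halide_mutex_lock/unlock are declared elsewhere in this runtime API, and
 * the state should be locked first, as noted for halide_profiler_get_state
 * below):
 * \code
 * struct halide_profiler_state *s = halide_profiler_get_state();
 * halide_mutex_lock(&s->lock);
 * for (struct halide_profiler_pipeline_stats *p = s->pipelines;
 *      p != NULL;
 *      p = (struct halide_profiler_pipeline_stats *)p->next) {
 *     // Inspect p->name, p->time, p->memory_peak, ...
 * }
 * halide_mutex_unlock(&s->lock);
 * \endcode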
*/ struct halide_profiler_pipeline_stats { /** Total time spent inside this pipeline (in nanoseconds) */ uint64_t time; /** The current memory allocation of funcs in this pipeline. */ uint64_t memory_current; /** The peak memory allocation of funcs in this pipeline. */ uint64_t memory_peak; /** The total memory allocation of funcs in this pipeline. */ uint64_t memory_total; /** The average number of thread pool worker threads doing useful * work while computing this pipeline. */ uint64_t active_threads_numerator, active_threads_denominator; /** The name of this pipeline. A global constant string. */ const char *name; /** An array containing states for each Func in this pipeline. */ struct halide_profiler_func_stats *funcs; /** The next pipeline_stats pointer. It's a void * because types * in the Halide runtime may not currently be recursive. */ void *next; /** The number of funcs in this pipeline. */ int num_funcs; /** An internal base id used to identify the funcs in this pipeline. */ int first_func_id; /** The number of times this pipeline has been run. */ int runs; /** The total number of samples taken inside of this pipeline. */ int samples; /** The total number of memory allocations of funcs in this pipeline. */ int num_allocs; }; /** The global state of the profiler. */ struct halide_profiler_state { /** Guards access to the fields below. If not locked, the sampling * profiler thread is free to modify things below (including * reordering the linked list of pipeline stats). */ struct halide_mutex lock; /** The amount of time the profiler thread sleeps between samples * in milliseconds. Defaults to 1 */ int sleep_time; /** An internal id used for bookkeeping. */ int first_free_id; /** The id of the current running Func. Set by the pipeline, read * periodically by the profiler thread. */ int current_func; /** The number of threads currently doing work. */ int active_threads; /** A linked list of stats gathered for each pipeline. */ struct halide_profiler_pipeline_stats *pipelines; /** Retrieve remote profiler state. Used so that the sampling * profiler can follow along with execution that occurs elsewhere, * e.g. on a DSP. If null, it reads from the int above instead. */ void (*get_remote_profiler_state)(int *func, int *active_workers); /** Sampling thread reference to be joined at shutdown. */ struct halide_thread *sampling_thread; }; /** Profiler func ids with special meanings. */ enum { /// current_func takes on this value when not inside Halide code halide_profiler_outside_of_halide = -1, /// Set current_func to this value to tell the profiling thread to /// halt. It will start up again next time you run a pipeline with /// profiling enabled. halide_profiler_please_stop = -2 }; /** Get a pointer to the global profiler state for programmatic * inspection. Lock it before using to pause the profiler. */ extern struct halide_profiler_state *halide_profiler_get_state(); /** Get a pointer to the pipeline state associated with pipeline_name. * This function grabs the global profiler state's lock on entry. */ extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const char *pipeline_name); /** Reset profiler state cheaply. May leave threads running or some * memory allocated but all accumulated statistics are reset. * WARNING: Do NOT call this method while any halide pipeline is * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock.
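 *
 * A typical usage pattern (illustrative; my_pipeline stands in for a
 * hypothetical AOT-compiled filter), run only after the pipeline has
 * finished:
 * \code
 * my_pipeline(input, output);    // run the filter being profiled
 * halide_profiler_report(NULL);  // print the statistics gathered so far
 * halide_profiler_reset();       // start the next measurement from scratch
 * \endcode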
*/ extern void halide_profiler_reset(); /** Reset all profiler state. * WARNING: Do NOT call this method while any halide pipeline is * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock. */ void halide_profiler_shutdown(); /** Print out timing statistics for everything run since the last * reset. Also happens at process exit. */ extern void halide_profiler_report(void *user_context); /// \name "Float16" functions /// These functions operate on bits (``uint16_t``) representing a half /// precision floating point number (IEEE-754 2008 binary16). //@{ /** Read bits representing a half precision floating point number and return * the float that represents the same value */ extern float halide_float16_bits_to_float(uint16_t); /** Read bits representing a half precision floating point number and return * the double that represents the same value */ extern double halide_float16_bits_to_double(uint16_t); // TODO: Conversion functions to half //@} // Allocating and freeing device memory is often very slow. The // methods below give Halide's runtime permission to hold onto device // memory to service future requests instead of returning it to the // underlying device API. The API does not manage an allocation pool, // all it does is provide access to a shared counter that acts as a // limit on the unused memory not yet returned to the underlying // device API. It makes callbacks to participants when memory needs to // be released because the limit is about to be exceeded (either // because the limit has been reduced, or because the memory owned by // some participant becomes unused). /** Tell Halide whether or not it is permitted to hold onto device * allocations to service future requests instead of returning them * eagerly to the underlying device API. Many device allocators are * quite slow, so it can be beneficial to set this to true. The * default value for now is false. * * Note that if enabled, the eviction policy is very simplistic. The * 32 most-recently used allocations are preserved, regardless of * their size. Additionally, if a call to cuMalloc results in an * out-of-memory error, the entire cache is flushed and the allocation * is retried. See https://github.com/halide/Halide/issues/4093 * * If set to false, releases all unused device allocations back to the * underlying device APIs. For finer-grained control, see specific * methods in each device api runtime. */ extern int halide_reuse_device_allocations(void *user_context, bool); /** Determines whether on device_free the memory is returned * immediately to the device API, or placed on a free list for future * use. Override and switch based on the user_context for * finer-grained control. By default just returns the value most * recently set by the method above. */ extern bool halide_can_reuse_device_allocations(void *user_context); struct halide_device_allocation_pool { int (*release_unused)(void *user_context); struct halide_device_allocation_pool *next; }; /** Register a callback to be informed when * halide_reuse_device_allocations(false) is called, and all unused * device allocations must be released. The object passed should have * global lifetime, and its next field will be clobbered.
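 *
 * A minimal sketch (my_release_unused and my_pool are hypothetical names):
 * \code
 * static int my_release_unused(void *user_context) {
 *     // Release any unused device allocations this participant is caching.
 *     return 0;
 * }
 * static struct halide_device_allocation_pool my_pool = {my_release_unused, NULL};
 * // Register once, e.g. at startup; my_pool must outlive all Halide usage.
 * halide_register_device_allocation_pool(&my_pool);
 * \endcode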
*/ extern void halide_register_device_allocation_pool(struct halide_device_allocation_pool *); #ifdef __cplusplus } // End extern "C" #endif #ifdef __cplusplus namespace { template struct check_is_pointer; template struct check_is_pointer {}; } // namespace /** Construct the halide equivalent of a C type */ template HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { // Create a compile-time error if T is not a pointer (without // using any includes - this code goes into the runtime). check_is_pointer check; (void)check; return halide_type_t(halide_type_handle, 64); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_float, 32); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_float, 64); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_uint, 1); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_uint, 8); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_uint, 16); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_uint, 32); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_uint, 64); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_int, 8); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_int, 16); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_int, 32); } template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_int, 64); } #endif #endif // HALIDE_HALIDERUNTIME_H namespace Halide { namespace Internal { /** A class representing a reference count to be used with IntrusivePtr */ class RefCount { std::atomic count; public: RefCount() noexcept : count(0) { } int increment() { return ++count; } // Increment and return new value int decrement() { return --count; } // Decrement and return new value bool is_zero() const { return count == 0; } }; /** * Because in this header we don't yet know how client classes store * their RefCount (and we don't want to depend on the declarations of * the client classes), any class that you want to hold onto via one * of these must provide implementations of ref_count and destroy, * which we forward-declare here. * * E.g. if you want to use IntrusivePtr, then you should * define something like this in MyClass.cpp (assuming MyClass has * a field: mutable RefCount ref_count): * * template<> RefCount &ref_count(const MyClass *c) noexcept {return c->ref_count;} * template<> void destroy(const MyClass *c) {delete c;} */ // @{ template RefCount &ref_count(const T *t) noexcept; template void destroy(const T *t); // @} /** Intrusive shared pointers have a reference count (a * RefCount object) stored in the class itself. This is perhaps more * efficient than storing it externally, but more importantly, it * means it's possible to recover a reference-counted handle from the * raw pointer, and it's impossible to have two different reference * counts attached to the same raw object. Seeing as we pass around * raw pointers to concrete IRNodes and Expr's interchangeably, this * is a useful property. 
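 *
 * A minimal usage sketch (MyClass is hypothetical and must provide the
 * ref_count/destroy specializations described above):
 * \code
 * IntrusivePtr<MyClass> a(new MyClass);  // reference count becomes 1
 * IntrusivePtr<MyClass> b = a;           // reference count becomes 2
 * // a.same_as(b) is true and both handles are defined().
 * a = IntrusivePtr<MyClass>();           // count drops back to 1
 * // When b goes out of scope the count reaches 0 and destroy() is called.
 * \endcode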
*/ template struct IntrusivePtr { private: void incref(T *p) { if (p) { ref_count(p).increment(); } }; void decref(T *p) { if (p) { // Note that if the refcount is already zero, then we're // in a recursive destructor due to a self-reference (a // cycle), where the ref_count has been adjusted to remove // the counts due to the cycle. The next line then makes // the ref_count negative, which prevents actually // entering the destructor recursively. if (ref_count(p).decrement() == 0) { destroy(p); } } } protected: T *ptr = nullptr; public: /** Access the raw pointer in a variety of ways. * Note that a "const IntrusivePtr" is not the same thing as an * IntrusivePtr. So the methods that return the ptr are * const, despite not adding an extra const to T. */ // @{ T *get() const { return ptr; } T &operator*() const { return *ptr; } T *operator->() const { return ptr; } // @} ~IntrusivePtr() { decref(ptr); } HALIDE_ALWAYS_INLINE IntrusivePtr() = default; HALIDE_ALWAYS_INLINE IntrusivePtr(T *p) : ptr(p) { incref(ptr); } HALIDE_ALWAYS_INLINE IntrusivePtr(const IntrusivePtr &other) noexcept : ptr(other.ptr) { incref(ptr); } HALIDE_ALWAYS_INLINE IntrusivePtr(IntrusivePtr &&other) noexcept : ptr(other.ptr) { other.ptr = nullptr; } IntrusivePtr &operator=(const IntrusivePtr &other) { if (other.ptr == ptr) return *this; // Other can be inside of something owned by this, so we // should be careful to incref other before we decref // ourselves. T *temp = other.ptr; incref(temp); decref(ptr); ptr = temp; return *this; } IntrusivePtr &operator=(IntrusivePtr &&other) noexcept { std::swap(ptr, other.ptr); return *this; } /* Handles can be null. This checks that. */ HALIDE_ALWAYS_INLINE bool defined() const { return ptr != nullptr; } /* Check if two handles point to the same ptr. This is * equality of reference, not equality of value. */ HALIDE_ALWAYS_INLINE bool same_as(const IntrusivePtr &other) const { return ptr == other.ptr; } HALIDE_ALWAYS_INLINE bool operator<(const IntrusivePtr &other) const { return ptr < other.ptr; } }; } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_TYPE_H #define HALIDE_TYPE_H #ifndef HALIDE_ERROR_H #define HALIDE_ERROR_H #include #include #ifndef HALIDE_DEBUG_H #define HALIDE_DEBUG_H /** \file * Defines functions for debug logging during code generation. */ #include #include #include namespace Halide { struct Expr; struct Type; // Forward declare some things from IRPrinter, which we can't include yet. std::ostream &operator<<(std::ostream &stream, const Expr &); std::ostream &operator<<(std::ostream &stream, const Type &); class Module; std::ostream &operator<<(std::ostream &stream, const Module &); struct Target; /** Emit a halide Target in a human readable form */ std::ostream &operator<<(std::ostream &stream, const Target &); namespace Internal { struct Stmt; std::ostream &operator<<(std::ostream &stream, const Stmt &); struct LoweredFunc; std::ostream &operator<<(std::ostream &, const LoweredFunc &); /** For optional debugging during codegen, use the debug class as * follows: * \code debug(verbosity) << "The expression is " << expr << "\n"; \endcode * * verbosity of 0 always prints, 1 should print after every major * stage, 2 should be used for more detail, and 3 should be used for * tracing everything that occurs. 
The verbosity with which to print * is determined by the value of the environment variable * HL_DEBUG_CODEGEN */ class debug { const bool logging; public: debug(int verbosity) : logging(verbosity <= debug_level()) { } template debug &operator<<(T &&x) { if (logging) { std::cerr << std::forward(x); } return *this; } static int debug_level(); }; } // namespace Internal } // namespace Halide #endif namespace Halide { /** Query whether Halide was compiled with exceptions. */ bool exceptions_enabled(); /** A base class for Halide errors. */ struct Error : public std::runtime_error { // Give each class a non-inlined constructor so that the type // doesn't get separately instantiated in each compilation unit. Error(const std::string &msg); }; /** An error that occurs while running a JIT-compiled Halide pipeline. */ struct RuntimeError : public Error { RuntimeError(const std::string &msg); }; /** An error that occurs while compiling a Halide pipeline that Halide * attributes to a user error. */ struct CompileError : public Error { CompileError(const std::string &msg); }; /** An error that occurs while compiling a Halide pipeline that Halide * attributes to an internal compiler bug, or to an invalid use of * Halide's internals. */ struct InternalError : public Error { InternalError(const std::string &msg); }; /** CompileTimeErrorReporter is used at compile time (*not* runtime) when * an error or warning is generated by Halide. Note that error() is called * a fatal error has occurred, and returning to Halide may cause a crash; * implementations of CompileTimeErrorReporter::error() should never return. * (Implementations of CompileTimeErrorReporter::warning() may return but * may also abort(), exit(), etc.) */ class CompileTimeErrorReporter { public: virtual ~CompileTimeErrorReporter() = default; virtual void warning(const char *msg) = 0; virtual void error(const char *msg) = 0; }; /** The default error reporter logs to stderr, then throws an exception * (if WITH_EXCEPTIONS) or calls abort (if not). This allows customization * of that behavior if a more gentle response to error reporting is desired. * Note that error_reporter is expected to remain valid across all Halide usage; * it is up to the caller to ensure that this is the case (and to do any * cleanup necessary). */ void set_custom_compile_time_error_reporter(CompileTimeErrorReporter *error_reporter); namespace Internal { struct ErrorReport { enum { User = 0x0001, Warning = 0x0002, Runtime = 0x0004 }; std::ostringstream msg; const int flags; ErrorReport(const char *f, int l, const char *cs, int flags); // Just a trick used to convert RValue into LValue HALIDE_ALWAYS_INLINE ErrorReport &ref() { return *this; } template ErrorReport &operator<<(const T &x) { msg << x; return *this; } /** When you're done using << on the object, and let it fall out of * scope, this errors out, or throws an exception if they are * enabled. This is a little dangerous because the destructor will * also be called if there's an exception in flight due to an * error in one of the arguments passed to operator<<. We handle * this by only actually throwing if there isn't an exception in * flight already. 
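 *
 * For example (illustrative; user_assert is one of the macros defined
 * below, and extent is a hypothetical variable):
 * \code
 * user_assert(extent > 0) << "The extent must be positive, but is " << extent << "\n";
 * \endcode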
*/ #if __cplusplus >= 201100 || _MSC_VER >= 1900 ~ErrorReport() noexcept(false); #else ~ErrorReport(); #endif }; // This uses operator precedence as a trick to avoid argument evaluation if // an assertion is true: it is intended to be used as part of the // _halide_internal_assertion macro, to coerce the result of the stream // expression to void (to match the condition-is-false case). class Voidifier { public: HALIDE_ALWAYS_INLINE Voidifier() = default; // This has to be an operator with a precedence lower than << but // higher than ?: HALIDE_ALWAYS_INLINE void operator&(ErrorReport &) { } }; /** * _halide_internal_assertion is used to implement our assertion macros * in such a way that the messages output for the assertion are only * evaluated if the assertion's value is false. * * Note that this macro intentionally has no parens internally; in actual * use, the implicit grouping will end up being * * condition ? (void) : (Voidifier() & (ErrorReport << arg1 << arg2 ... << argN)) * * This (regrettably) requires a macro to work, but has the highly desirable * effect that all assertion parameters are totally skipped (not ever evaluated) * when the assertion is true. */ #define _halide_internal_assertion(condition, flags) \ (condition) ? (void)0 : ::Halide::Internal::Voidifier() & ::Halide::Internal::ErrorReport(__FILE__, __LINE__, #condition, flags).ref() #define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0) #define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User) #define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning) #define halide_runtime_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Runtime) #define internal_assert(c) _halide_internal_assertion(c, 0) #define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) // The nicely named versions get cleaned up at the end of Halide.h, // but user code might want to do halide-style user_asserts (e.g. the // Extern macros introduce calls to user_assert), so for that purpose // we define an equivalent macro that can be used outside of Halide.h #define _halide_user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) // N.B. Any function that might throw a user_assert or user_error may // not be inlined into the user's code, or the line number will be // misattributed to Halide.h. Either make such functions internal to // libHalide, or mark them as HALIDE_NO_USER_CODE_INLINE. } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_FLOAT16_H #define HALIDE_FLOAT16_H #include #include namespace Halide { /** Class that provides a type that implements half precision * floating point (IEEE754 2008 binary16) in software. * * This type is enforced to be 16-bits wide and maintains no state * other than the raw IEEE754 binary16 bits so that it can passed * to code that checks a type's size and used for halide_buffer_t allocation. * */ struct float16_t { static const int mantissa_bits = 10; static const uint16_t sign_mask = 0x8000; static const uint16_t exponent_mask = 0x7c00; static const uint16_t mantissa_mask = 0x03ff; /// \name Constructors /// @{ /** Construct from a float, double, or int using * round-to-nearest-ties-to-even. Out-of-range values become +/- * infinity. 
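 *
 * For example (illustrative): float16_t(1.0f).to_bits() == 0x3c00, and
 * float16_t(70000.0f) becomes +infinity, since the largest finite binary16
 * value is 65504.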
*/ // @{ explicit float16_t(float value); explicit float16_t(double value); explicit float16_t(int value); // @} /** Construct a float16_t with the bits initialised to 0. This represents * positive zero.*/ float16_t() = default; /// @} // Use explicit to avoid accidently raising the precision /** Cast to float */ explicit operator float() const; /** Cast to double */ explicit operator double() const; /** Cast to int */ explicit operator int() const; /** Get a new float16_t that represents a special value */ // @{ static float16_t make_zero(); static float16_t make_negative_zero(); static float16_t make_infinity(); static float16_t make_negative_infinity(); static float16_t make_nan(); // @} /** Get a new float16_t with the given raw bits * * \param bits The bits conformant to IEEE754 binary16 */ static float16_t make_from_bits(uint16_t bits); /** Return a new float16_t with a negated sign bit*/ float16_t operator-() const; /** Arithmetic operators. */ // @{ float16_t operator+(float16_t rhs) const; float16_t operator-(float16_t rhs) const; float16_t operator*(float16_t rhs) const; float16_t operator/(float16_t rhs) const; float16_t operator+=(float16_t rhs) { return (*this = *this + rhs); } float16_t operator-=(float16_t rhs) { return (*this = *this - rhs); } float16_t operator*=(float16_t rhs) { return (*this = *this * rhs); } float16_t operator/=(float16_t rhs) { return (*this = *this / rhs); } // @} /** Comparison operators */ // @{ bool operator==(float16_t rhs) const; bool operator!=(float16_t rhs) const { return !(*this == rhs); } bool operator>(float16_t rhs) const; bool operator<(float16_t rhs) const; bool operator>=(float16_t rhs) const { return (*this > rhs) || (*this == rhs); } bool operator<=(float16_t rhs) const { return (*this < rhs) || (*this == rhs); } // @} /** Properties */ // @{ bool is_nan() const; bool is_infinity() const; bool is_negative() const; bool is_zero() const; // @} /** Returns the bits that represent this float16_t. * * An alternative method to access the bits is to cast a pointer * to this instance as a pointer to a uint16_t. **/ uint16_t to_bits() const; private: // The raw bits. uint16_t data = 0; }; static_assert(sizeof(float16_t) == 2, "float16_t should occupy two bytes"); } // namespace Halide template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_float, 16); } namespace Halide { /** Class that provides a type that implements half precision * floating point using the bfloat16 format. * * This type is enforced to be 16-bits wide and maintains no state * other than the raw bits so that it can passed to code that checks * a type's size and used for halide_buffer_t allocation. */ struct bfloat16_t { static const int mantissa_bits = 7; static const uint16_t sign_mask = 0x8000; static const uint16_t exponent_mask = 0x7f80; static const uint16_t mantissa_mask = 0x007f; static const bfloat16_t zero, negative_zero, infinity, negative_infinity, nan; /// \name Constructors /// @{ /** Construct from a float, double, or int using * round-to-nearest-ties-to-even. Out-of-range values become +/- * infinity. */ // @{ explicit bfloat16_t(float value); explicit bfloat16_t(double value); explicit bfloat16_t(int value); // @} /** Construct a bfloat16_t with the bits initialised to 0. 
This represents * positive zero.*/ bfloat16_t() = default; /// @} // Use explicit to avoid accidently raising the precision /** Cast to float */ explicit operator float() const; /** Cast to double */ explicit operator double() const; /** Cast to int */ explicit operator int() const; /** Get a new bfloat16_t that represents a special value */ // @{ static bfloat16_t make_zero(); static bfloat16_t make_negative_zero(); static bfloat16_t make_infinity(); static bfloat16_t make_negative_infinity(); static bfloat16_t make_nan(); // @} /** Get a new bfloat16_t with the given raw bits * * \param bits The bits conformant to IEEE754 binary16 */ static bfloat16_t make_from_bits(uint16_t bits); /** Return a new bfloat16_t with a negated sign bit*/ bfloat16_t operator-() const; /** Arithmetic operators. */ // @{ bfloat16_t operator+(bfloat16_t rhs) const; bfloat16_t operator-(bfloat16_t rhs) const; bfloat16_t operator*(bfloat16_t rhs) const; bfloat16_t operator/(bfloat16_t rhs) const; bfloat16_t operator+=(bfloat16_t rhs) { return (*this = *this + rhs); } bfloat16_t operator-=(bfloat16_t rhs) { return (*this = *this - rhs); } bfloat16_t operator*=(bfloat16_t rhs) { return (*this = *this * rhs); } bfloat16_t operator/=(bfloat16_t rhs) { return (*this = *this / rhs); } // @} /** Comparison operators */ // @{ bool operator==(bfloat16_t rhs) const; bool operator!=(bfloat16_t rhs) const { return !(*this == rhs); } bool operator>(bfloat16_t rhs) const; bool operator<(bfloat16_t rhs) const; bool operator>=(bfloat16_t rhs) const { return (*this > rhs) || (*this == rhs); } bool operator<=(bfloat16_t rhs) const { return (*this < rhs) || (*this == rhs); } // @} /** Properties */ // @{ bool is_nan() const; bool is_infinity() const; bool is_negative() const; bool is_zero() const; // @} /** Returns the bits that represent this bfloat16_t. * * An alternative method to access the bits is to cast a pointer * to this instance as a pointer to a uint16_t. **/ uint16_t to_bits() const; private: // The raw bits. uint16_t data = 0; }; static_assert(sizeof(bfloat16_t) == 2, "bfloat16_t should occupy two bytes"); } // namespace Halide template<> HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() { return halide_type_t(halide_type_bfloat, 16); } #endif // Always use assert, even if llvm-config defines NDEBUG #ifdef NDEBUG #undef NDEBUG #include #define NDEBUG #else #include #endif #ifndef HALIDE_UTIL_H #define HALIDE_UTIL_H /** \file * Various utility functions used internally Halide. */ #include #include #include #include #include #include #ifndef HALIDE_EXPORT #if defined(_MSC_VER) // Halide_EXPORTS is quietly defined by CMake when building a shared library #ifdef Halide_EXPORTS #define HALIDE_EXPORT __declspec(dllexport) #else #define HALIDE_EXPORT __declspec(dllimport) #endif #else #define HALIDE_EXPORT #endif #endif // If we're in user code, we don't want certain functions to be inlined. #if defined(COMPILING_HALIDE) || defined(BUILDING_PYTHON) #define HALIDE_NO_USER_CODE_INLINE #else #define HALIDE_NO_USER_CODE_INLINE HALIDE_NEVER_INLINE #endif // On windows, Halide needs a larger stack than the default MSVC provides #ifdef _MSC_VER #pragma comment(linker, "/STACK:8388608,1048576") #endif namespace Halide { /** Load a plugin in the form of a dynamic library (e.g. for custom autoschedulers). * If the string doesn't contain any . 
characters, the proper prefix and/or suffix * for the platform will be added: * * foo -> libfoo.so (Linux/OSX/etc -- note that .dylib is not supported) * foo -> foo.dll (Windows) * * otherwise, it is assumed to be an appropriate pathname. * * Any error in loading will assert-fail. */ void load_plugin(const std::string &lib_name); namespace Internal { /** Some numeric conversions are UB if the value won't fit in the result; * safe_numeric_cast<>() is meant as a drop-in replacement for a C/C++ cast * that adds well-defined behavior for the UB cases, attempting to mimic * common implementation behavior as much as possible. */ template::value>::type * = nullptr> DST safe_numeric_cast(SRC s) { if (std::is_integral::value) { // Treat float -> int as a saturating cast; this is handled // in different ways by different compilers, so an arbitrary but safe // choice like this is reasonable. if (s < (SRC)std::numeric_limits::min()) { return std::numeric_limits::min(); } if (s > (SRC)std::numeric_limits::max()) { return std::numeric_limits::max(); } } return (DST)s; } template::value>::type * = nullptr> DST safe_numeric_cast(SRC s) { if (std::is_integral::value) { // any-int -> signed-int is technically UB if value won't fit; // in practice, common compilers implement such conversions as done below // (as verified by exhaustive testing on Clang for x86-64). We could // probably continue to rely on that behavior, but making it explicit // avoids possible wrather of UBSan and similar debug helpers. // (Yes, using sizeof for this comparison is a little odd for the uint->int // case, but the intent is to match existing common behavior, which this does.) if (std::is_integral::value && std::is_signed::value && sizeof(DST) < sizeof(SRC)) { using UnsignedSrc = typename std::make_unsigned::type; return (DST)(s & (UnsignedSrc)(-1)); } } return (DST)s; } /** An aggressive form of reinterpret cast used for correct type-punning. */ template DstType reinterpret_bits(const SrcType &src) { static_assert(sizeof(SrcType) == sizeof(DstType), "Types must be same size"); DstType dst; memcpy(&dst, &src, sizeof(SrcType)); return dst; } /** Make a unique name for an object based on the name of the stack * variable passed in. If introspection isn't working or there are no * debug symbols, just uses unique_name with the given prefix. */ std::string make_entity_name(void *stack_ptr, const std::string &type, char prefix); /** Get value of an environment variable. Returns its value * is defined in the environment. If the var is not defined, an empty string * is returned. */ std::string get_env_variable(char const *env_var_name); /** Get the name of the currently running executable. Platform-specific. * If program name cannot be retrieved, function returns an empty string. */ std::string running_program_name(); /** Generate a unique name starting with the given prefix. It's unique * relative to all other strings returned by unique_name in this * process. * * The single-character version always appends a numeric suffix to the * character. * * The string version will either return the input as-is (with high * probability on the first time it is called with that input), or * replace any existing '$' characters with underscores, then add a * '$' sign and a numeric suffix to it. * * Note that unique_name('f') therefore differs from * unique_name("f"). The former returns something like f123, and the * latter returns either f or f$123. 
*/ // @{ std::string unique_name(char prefix); std::string unique_name(const std::string &prefix); // @} /** Test if the first string starts with the second string */ bool starts_with(const std::string &str, const std::string &prefix); /** Test if the first string ends with the second string */ bool ends_with(const std::string &str, const std::string &suffix); /** Replace all matches of the second string in the first string with the last string */ std::string replace_all(const std::string &str, const std::string &find, const std::string &replace); /** Split the source string using 'delim' as the divider. */ std::vector split_string(const std::string &source, const std::string &delim); /** Perform a left fold of a vector. Returns a default-constructed * vector element if the vector is empty. Similar to std::accumulate * but with a less clunky syntax. */ template T fold_left(const std::vector &vec, Fn f) { T result; if (vec.empty()) { return result; } result = vec[0]; for (size_t i = 1; i < vec.size(); i++) { result = f(result, vec[i]); } return result; } /** Returns a right fold of a vector. Returns a default-constructed * vector element if the vector is empty. */ template T fold_right(const std::vector &vec, Fn f) { T result; if (vec.empty()) { return result; } result = vec.back(); for (size_t i = vec.size() - 1; i > 0; i--) { result = f(vec[i - 1], result); } return result; } template struct meta_and : std::true_type {}; template struct meta_and : std::integral_constant::value> {}; template struct meta_or : std::false_type {}; template struct meta_or : std::integral_constant::value> {}; template struct all_are_convertible : meta_and...> {}; /** Returns base name and fills in namespaces, outermost one first in vector. */ std::string extract_namespaces(const std::string &name, std::vector &namespaces); struct FileStat { uint64_t file_size; uint32_t mod_time; // Unix epoch time uint32_t uid; uint32_t gid; uint32_t mode; }; /** Create a unique file with a name of the form prefixXXXXXsuffix in an arbitrary * (but writable) directory; this is typically /tmp, but the specific * location is not guaranteed. (Note that the exact form of the file name * may vary; in particular, the suffix may be ignored on Windows.) * The file is created (but not opened), thus this can be called from * different threads (or processes, e.g. when building with parallel make) * without risking collision. Note that if this file is used as a temporary * file, the caller is responsibly for deleting it. Neither the prefix nor suffix * may contain a directory separator. */ std::string file_make_temp(const std::string &prefix, const std::string &suffix); /** Create a unique directory in an arbitrary (but writable) directory; this is * typically somewhere inside /tmp, but the specific location is not guaranteed. * The directory will be empty (i.e., this will never return /tmp itself, * but rather a new directory inside /tmp). The caller is responsible for removing the * directory after use. */ std::string dir_make_temp(); /** Wrapper for access(). Quietly ignores errors. */ bool file_exists(const std::string &name); /** assert-fail if the file doesn't exist. useful primarily for testing purposes. */ void assert_file_exists(const std::string &name); /** assert-fail if the file DOES exist. useful primarily for testing purposes. */ void assert_no_file_exists(const std::string &name); /** Wrapper for unlink(). Asserts upon error. */ void file_unlink(const std::string &name); /** Wrapper for unlink(). Quietly ignores errors. 
*/ void file_unlink(const std::string &name); /** Ensure that no file with this path exists. If such a file * exists and cannot be removed, assert-fail. */ void ensure_no_file_exists(const std::string &name); /** Wrapper for rmdir(). Asserts upon error. */ void dir_rmdir(const std::string &name); /** Wrapper for stat(). Asserts upon error. */ FileStat file_stat(const std::string &name); /** Read the entire contents of a file into a vector. The file * is read in binary mode. Errors trigger an assertion failure. */ std::vector read_entire_file(const std::string &pathname); /** Create or replace the contents of a file with a given pointer-and-length * of memory. If the file doesn't exist, it is created; if it does exist, it * is completely overwritten. Any error triggers an assertion failure. */ void write_entire_file(const std::string &pathname, const void *source, size_t source_len); inline void write_entire_file(const std::string &pathname, const std::vector &source) { write_entire_file(pathname, source.data(), source.size()); } /** A simple utility class that creates a temporary file in its ctor and * deletes that file in its dtor; this is useful for temporary files that you * want to ensure are deleted when exiting a certain scope. Since this is essentially * just an RAII wrapper around file_make_temp() and file_unlink(), it has the same * failure modes (i.e.: assertion upon error). */ class TemporaryFile final { public: TemporaryFile(const std::string &prefix, const std::string &suffix) : temp_path(file_make_temp(prefix, suffix)), do_unlink(true) { } const std::string &pathname() const { return temp_path; } ~TemporaryFile() { if (do_unlink) { file_unlink(temp_path); } } // You can call this if you want to defeat the automatic deletion; // this is rarely what you want to do (since it defeats the purpose // of this class), but can be quite handy for debugging purposes. void detach() { do_unlink = false; } private: const std::string temp_path; bool do_unlink; TemporaryFile(const TemporaryFile &) = delete; void operator=(const TemporaryFile &) = delete; }; /** Routines to test if math would overflow for signed integers with * the given number of bits. */ // @{ bool add_would_overflow(int bits, int64_t a, int64_t b); bool sub_would_overflow(int bits, int64_t a, int64_t b); bool mul_would_overflow(int bits, int64_t a, int64_t b); // @} /** Helper class for saving/restoring variable values on the stack, to allow * for early-exit that preserves correctness */ template struct ScopedValue { T &var; T old_value; /** Preserve the old value, restored at dtor time */ ScopedValue(T &var) : var(var), old_value(var) { } /** Preserve the old value, then set the var to a new value. */ ScopedValue(T &var, T new_value) : var(var), old_value(var) { var = new_value; } ~ScopedValue() { var = old_value; } operator T() const { return old_value; } // allow move but not copy ScopedValue(const ScopedValue &that) = delete; ScopedValue(ScopedValue &&that) noexcept = default; }; // Wrappers for some C++14-isms that are useful and trivially implementable // in C++11; these are defined in the Halide::Internal namespace. If we // are compiling under C++14 or later, we just use the standard implementations // rather than our own. 
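// For example (illustrative only), make_index_sequence<3> is the type
// index_sequence<0, 1, 2>; code that needs to expand a tuple typically takes
// it as an extra parameter and expands the pack, e.g. std::get<I>(t)...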
#if __cplusplus >= 201402L // C++14: Use the standard implementations using std::index_sequence; using std::integer_sequence; using std::make_index_sequence; using std::make_integer_sequence; #else // C++11: std::integer_sequence (etc) is standard in C++14 but not C++11, but // is easily written in C++11. This is a simple version that could // probably be improved. template struct integer_sequence { static constexpr size_t size() { return sizeof...(Ints); } }; template struct next_integer_sequence; template struct next_integer_sequence> { using type = integer_sequence; }; template struct make_integer_sequence_helper { using type = typename next_integer_sequence< typename make_integer_sequence_helper::type>::type; }; template struct make_integer_sequence_helper { using type = integer_sequence; }; template using make_integer_sequence = typename make_integer_sequence_helper::type; template using index_sequence = integer_sequence; template using make_index_sequence = make_integer_sequence; #endif // Helpers for timing blocks of code. Put 'TIC;' at the start and // 'TOC;' at the end. Timing is reported at the toc via // debug(0). The calls can be nested and will pretty-print // appropriately. Took this idea from matlab via Jon Barron. // // Note that this uses global state internally, and is not thread-safe // at all. Only use it for single-threaded debugging sessions. void halide_tic_impl(const char *file, int line); void halide_toc_impl(const char *file, int line); #define HALIDE_TIC Halide::Internal::halide_tic_impl(__FILE__, __LINE__) #define HALIDE_TOC Halide::Internal::halide_toc_impl(__FILE__, __LINE__) #ifdef COMPILING_HALIDE #define TIC HALIDE_TIC #define TOC HALIDE_TOC #endif // statically cast a value from one type to another: this is really just // some syntactic sugar around static_cast<>() to avoid compiler warnings // regarding 'bool' in some compliation configurations. template struct StaticCast { template::value>::type * = nullptr> inline constexpr static TO2 value(const FROM &from) { return static_cast(from); } template::value>::type * = nullptr> inline constexpr static TO2 value(const FROM &from) { return from != 0; } }; // Like std::is_convertible, but with additional tests for arithmetic types: // ensure that the value will roundtrip losslessly (e.g., no integer truncation // or dropping of fractional parts). template struct IsRoundtrippable { template::value>::type * = nullptr> inline constexpr static bool value(const FROM &from) { return false; } template::value && std::is_arithmetic::value && std::is_arithmetic::value && !std::is_same::value>::type * = nullptr> inline constexpr static bool value(const FROM &from) { return StaticCast::value(StaticCast::value(from)) == from; } template::value && !(std::is_arithmetic::value && std::is_arithmetic::value && !std::is_same::value)>::type * = nullptr> inline constexpr static bool value(const FROM &from) { return true; } }; /** Emit a version of a string that is a valid identifier in C (. is replaced with _) */ std::string c_print_name(const std::string &name); } // namespace Internal } // namespace Halide #endif #include /** \file * Defines halide types */ /** A set of types to represent a C++ function signature. This allows * two things. First, proper prototypes can be provided for Halide * generated functions, giving better compile time type * checking. Second, C++ name mangling can be done to provide link * time type checking for both Halide generated functions and calls * from Halide to external functions. 
* * These are intended to be constexpr producible, but we don't depend * on C++11 yet. In C++14, it is possible these will be replaced with * introspection/reflection facilities. * * halide_handle_traits has to go outside the Halide namespace due to template * resolution rules. TODO(zalman): Do all types need to be in global namespace? */ //@{ /** A structure to represent the (unscoped) name of a C++ composite type for use * as a single argument (or return value) in a function signature. * * Currently does not support the restrict qualifier, references, or * r-value references. These features cannot be used in extern * function calls from Halide or in the generated function from * Halide, but their applicability seems limited anyway. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it. */ struct halide_cplusplus_type_name { /// An enum to indicate whether a C++ type is non-composite, a struct, class, or union enum CPPTypeType { Simple, ///< "int" Struct, ///< "struct Foo" Class, ///< "class Foo" Union, ///< "union Foo" Enum, ///< "enum Foo" } cpp_type_type; // Note: order is reflected in map_to_name table in CPlusPlusMangle.cpp std::string name; halide_cplusplus_type_name(CPPTypeType cpp_type_type, const std::string &name) : cpp_type_type(cpp_type_type), name(name) { } bool operator==(const halide_cplusplus_type_name &rhs) const { return cpp_type_type == rhs.cpp_type_type && name == rhs.name; } bool operator!=(const halide_cplusplus_type_name &rhs) const { return !(*this == rhs); } bool operator<(const halide_cplusplus_type_name &rhs) const { return cpp_type_type < rhs.cpp_type_type || (cpp_type_type == rhs.cpp_type_type && name < rhs.name); } }; /** A structure to represent the fully scoped name of a C++ composite * type for use in generating function signatures that use that type. * * This is intended to be a constexpr usable type, but we don't depend * on C++11 yet. In C++14, it is possible this will be replaced with * introspection/reflection facilities. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it. */ struct halide_handle_cplusplus_type { halide_cplusplus_type_name inner_name; std::vector<std::string> namespaces; std::vector<halide_cplusplus_type_name> enclosing_types; /// One set of modifiers on a type. /// The const/volatile/restrict properties are "inside" the pointer property. enum Modifier : uint8_t { Const = 1 << 0, ///< Bitmask flag for "const" Volatile = 1 << 1, ///< Bitmask flag for "volatile" Restrict = 1 << 2, ///< Bitmask flag for "restrict" Pointer = 1 << 3, ///< Bitmask flag for a pointer "*" }; /// Qualifiers and indirections on type. 0 is innermost. std::vector<uint8_t> cpp_type_modifiers; /// References are separate because they only occur at the outermost level. /// No modifiers are needed for references as they are not allowed to apply /// to the reference itself. (This isn't true for restrict, but that is a C++ /// extension anyway.) If modifiers are needed, the last entry in the above /// array would be the modifiers for the reference.
enum ReferenceType : uint8_t { NotReference = 0, LValueReference = 1, // "&" RValueReference = 2, // "&&" }; ReferenceType reference_type; halide_handle_cplusplus_type(const halide_cplusplus_type_name &inner_name, const std::vector &namespaces = {}, const std::vector &enclosing_types = {}, const std::vector &modifiers = {}, ReferenceType reference_type = NotReference) : inner_name(inner_name), namespaces(namespaces), enclosing_types(enclosing_types), cpp_type_modifiers(modifiers), reference_type(reference_type) { } template static const halide_handle_cplusplus_type make(); }; //@} /** halide_c_type_to_name is a utility class used to provide a user-extensible * way of naming Handle types. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it * directly (use the HALIDE_DECLARE_EXTERN_xxx macros instead). */ template struct halide_c_type_to_name { static constexpr bool known_type = false; static halide_cplusplus_type_name name() { return {halide_cplusplus_type_name::Simple, "void"}; } }; #define HALIDE_DECLARE_EXTERN_TYPE(TypeType, Type) \ template<> \ struct halide_c_type_to_name { \ static constexpr bool known_type = true; \ static halide_cplusplus_type_name name() { \ return {halide_cplusplus_type_name::TypeType, #Type}; \ } \ } #define HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Simple, T) #define HALIDE_DECLARE_EXTERN_STRUCT_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Struct, T) #define HALIDE_DECLARE_EXTERN_CLASS_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Class, T) #define HALIDE_DECLARE_EXTERN_UNION_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Union, T) HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(char); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(bool); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int8_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint8_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int32_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint32_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int64_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint64_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::float16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::bfloat16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(float); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(double); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_buffer_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_dimension_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_device_interface_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_filter_metadata_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_semaphore_t); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_parallel_task_t); // You can make arbitrary user-defined types be "Known" using the // macro above. This is useful for making Param<> arguments for // Generators type safe. e.g., // // struct MyFunStruct { ... }; // // ... // // HALIDE_DECLARE_EXTERN_STRUCT_TYPE(MyFunStruct); // // ... // // class MyGenerator : public Generator { // Param my_struct_ptr; // ... // }; template /*static*/ const halide_handle_cplusplus_type halide_handle_cplusplus_type::make() { constexpr bool is_ptr = std::is_pointer::value; constexpr bool is_lvalue_reference = std::is_lvalue_reference::value; constexpr bool is_rvalue_reference = std::is_rvalue_reference::value; using TBase = typename std::remove_pointer::type>::type; constexpr bool is_const = std::is_const::value; constexpr bool is_volatile = std::is_volatile::value; constexpr uint8_t modifiers = static_cast( (is_ptr ? 
halide_handle_cplusplus_type::Pointer : 0) | (is_const ? halide_handle_cplusplus_type::Const : 0) | (is_volatile ? halide_handle_cplusplus_type::Volatile : 0)); constexpr halide_handle_cplusplus_type::ReferenceType ref_type = (is_lvalue_reference ? halide_handle_cplusplus_type::LValueReference : is_rvalue_reference ? halide_handle_cplusplus_type::RValueReference : halide_handle_cplusplus_type::NotReference); using TNonCVBase = typename std::remove_cv::type; constexpr bool known_type = halide_c_type_to_name::known_type; static_assert(!(!known_type && !is_ptr), "Unknown types must be pointers"); halide_handle_cplusplus_type info = { halide_c_type_to_name::name(), {}, {}, {modifiers}, ref_type}; // Pull off any namespaces info.inner_name.name = Halide::Internal::extract_namespaces(info.inner_name.name, info.namespaces); return info; } /** A type traits template to provide a halide_handle_cplusplus_type * value from a C++ type. * * Note the type represented is implicitly a pointer. * * A NULL pointer of type halide_handle_traits represents "void *". * This is chosen for compactness or representation as Type is a very * widely used data structure. * * Although this is in the global namespace, it should be considered "Halide Internal" * and subject to change; code outside Halide should avoid referencing it directly. */ template struct halide_handle_traits { // This trait must return a pointer to a global structure. I.e. it should never be freed. // A return value of nullptr here means "void *". HALIDE_ALWAYS_INLINE static const halide_handle_cplusplus_type *type_info() { if (std::is_pointer::value || std::is_lvalue_reference::value || std::is_rvalue_reference::value) { static const halide_handle_cplusplus_type the_info = halide_handle_cplusplus_type::make(); return &the_info; } return nullptr; } }; namespace Halide { struct Expr; /** Types in the halide type system. They can be ints, unsigned ints, * or floats of various bit-widths (the 'bits' field). They can also * be vectors of the same (by setting the 'lanes' field to something * larger than one). Front-end code shouldn't use vector * types. Instead vectorize a function. */ struct Type { private: halide_type_t type; public: /** Aliases for halide_type_code_t values for legacy compatibility * and to match the Halide internal C++ style. */ // @{ static const halide_type_code_t Int = halide_type_int; static const halide_type_code_t UInt = halide_type_uint; static const halide_type_code_t Float = halide_type_float; static const halide_type_code_t BFloat = halide_type_bfloat; static const halide_type_code_t Handle = halide_type_handle; // @} /** The number of bytes required to store a single scalar value of this type. Ignores vector lanes. */ int bytes() const { return (bits() + 7) / 8; } // Default ctor initializes everything to predictable-but-unlikely values Type() : type(Handle, 0, 0), handle_type(nullptr) { } /** Construct a runtime representation of a Halide type from: * code: The fundamental type from an enum. * bits: The bit size of one element. * lanes: The number of vector elements in the type. */ Type(halide_type_code_t code, int bits, int lanes, const halide_handle_cplusplus_type *handle_type = nullptr) : type(code, (uint8_t)bits, (uint16_t)lanes), handle_type(handle_type) { } /** Trivial copy constructor. */ Type(const Type &that) = default; /** Trivial copy assignment operator. */ Type &operator=(const Type &that) = default; /** Type is a wrapper around halide_type_t with more methods for use * inside the compiler. 
This simply constructs the wrapper around * the runtime value. */ HALIDE_ALWAYS_INLINE Type(const halide_type_t &that, const halide_handle_cplusplus_type *handle_type = nullptr) : type(that), handle_type(handle_type) { } /** Unwrap the runtime halide_type_t for use in runtime calls, etc. * Representation is exactly equivalent. */ HALIDE_ALWAYS_INLINE operator halide_type_t() const { return type; } /** Return the underlying data type of an element as an enum value. */ HALIDE_ALWAYS_INLINE halide_type_code_t code() const { return (halide_type_code_t)type.code; } /** Return the bit size of a single element of this type. */ HALIDE_ALWAYS_INLINE int bits() const { return type.bits; } /** Return the number of vector elements in this type. */ HALIDE_ALWAYS_INLINE int lanes() const { return type.lanes; } /** Return Type with same number of bits and lanes, but new_code for a type code. */ Type with_code(halide_type_code_t new_code) const { return Type(new_code, bits(), lanes(), (new_code == code()) ? handle_type : nullptr); } /** Return Type with same type code and lanes, but new_bits for the number of bits. */ Type with_bits(int new_bits) const { return Type(code(), new_bits, lanes(), (new_bits == bits()) ? handle_type : nullptr); } /** Return Type with same type code and number of bits, * but new_lanes for the number of vector lanes. */ Type with_lanes(int new_lanes) const { return Type(code(), bits(), new_lanes, handle_type); } /** Type to be printed when declaring handles of this type. */ const halide_handle_cplusplus_type *handle_type; /** Is this type boolean (represented as UInt(1))? */ HALIDE_ALWAYS_INLINE bool is_bool() const { return code() == UInt && bits() == 1; } /** Is this type a vector type? (lanes() != 1). * TODO(abadams): Decide what to do for lanes() == 0. */ HALIDE_ALWAYS_INLINE bool is_vector() const { return lanes() != 1; } /** Is this type a scalar type? (lanes() == 1). * TODO(abadams): Decide what to do for lanes() == 0. */ HALIDE_ALWAYS_INLINE bool is_scalar() const { return lanes() == 1; } /** Is this type a floating point type (float or double). */ HALIDE_ALWAYS_INLINE bool is_float() const { return code() == Float || code() == BFloat; } /** Is this type a floating point type (float or double). */ HALIDE_ALWAYS_INLINE bool is_bfloat() const { return code() == BFloat; } /** Is this type a signed integer type? */ HALIDE_ALWAYS_INLINE bool is_int() const { return code() == Int; } /** Is this type an unsigned integer type? */ HALIDE_ALWAYS_INLINE bool is_uint() const { return code() == UInt; } /** Is this type an integer type of any sort? */ HALIDE_ALWAYS_INLINE bool is_int_or_uint() const { return code() == Int || code() == UInt; } /** Is this type an opaque handle type (void *) */ HALIDE_ALWAYS_INLINE bool is_handle() const { return code() == Handle; } /** Check that the type name of two handles matches. 
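 * (A hedged example: the handle types produced by type_of<Foo *>() and
 * type_of<Bar *>(), where Foo and Bar are hypothetical user structs declared
 * via HALIDE_DECLARE_EXTERN_STRUCT_TYPE, share the same code/bits/lanes but
 * carry different handle_type info, so same_handle_type() and hence
 * operator== treat them as distinct.)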
*/ bool same_handle_type(const Type &other) const; /** Compare two types for equality */ bool operator==(const Type &other) const { return type == other.type && (code() != Handle || same_handle_type(other)); } /** Compare two types for inequality */ bool operator!=(const Type &other) const { return type != other.type || (code() == Handle && !same_handle_type(other)); } /** Compare ordering of two types so they can be used in certain containers and algorithms */ bool operator<(const Type &other) const { if (type < other.type) return true; if (code() == Handle) return handle_type < other.handle_type; return false; } /** Produce the scalar type (that of a single element) of this vector type */ Type element_of() const { return with_lanes(1); } /** Can this type represent all values of another type? */ bool can_represent(Type other) const; /** Can this type represent a particular constant? */ // @{ bool can_represent(double x) const; bool can_represent(int64_t x) const; bool can_represent(uint64_t x) const; // @} /** Check if an integer constant value is the maximum or minimum * representable value for this type. */ // @{ bool is_max(uint64_t) const; bool is_max(int64_t) const; bool is_min(uint64_t) const; bool is_min(int64_t) const; // @} /** Return an expression which is the maximum value of this type. * Returns infinity for types which can represent it. */ Expr max() const; /** Return an expression which is the minimum value of this type. * Returns -infinity for types which can represent it. */ Expr min() const; }; /** Constructing a signed integer type */ inline Type Int(int bits, int lanes = 1) { return Type(Type::Int, bits, lanes); } /** Constructing an unsigned integer type */ inline Type UInt(int bits, int lanes = 1) { return Type(Type::UInt, bits, lanes); } /** Construct a floating-point type */ inline Type Float(int bits, int lanes = 1) { return Type(Type::Float, bits, lanes); } /** Construct a floating-point type in the bfloat format. Only 16-bit currently supported. */ inline Type BFloat(int bits, int lanes = 1) { return Type(Type::BFloat, bits, lanes); } /** Construct a boolean type */ inline Type Bool(int lanes = 1) { return UInt(1, lanes); } /** Construct a handle type */ inline Type Handle(int lanes = 1, const halide_handle_cplusplus_type *handle_type = nullptr) { return Type(Type::Handle, 64, lanes, handle_type); } /** Construct the halide equivalent of a C type */ template inline Type type_of() { return Type(halide_type_of(), halide_handle_traits::type_info()); } /** Halide type to a C++ type */ std::string type_to_c_type(Type type, bool include_space, bool c_plus_plus = true); } // namespace Halide #endif namespace Halide { struct bfloat16_t; struct float16_t; namespace Internal { class IRMutator; class IRVisitor; /** All our IR node types get unique IDs for the purposes of RTTI */ enum class IRNodeType { // Exprs, in order of strength IntImm, UIntImm, FloatImm, StringImm, Broadcast, Cast, Variable, Add, Sub, Mod, Mul, Div, Min, Max, EQ, NE, LT, LE, GT, GE, And, Or, Not, Select, Load, Ramp, Call, Let, Shuffle, // Stmts LetStmt, AssertStmt, ProducerConsumer, For, Acquire, Store, Provide, Allocate, Free, Realize, Block, Fork, IfThenElse, Evaluate, Prefetch, Atomic }; /** The abstract base classes for a node in the Halide IR. */ struct IRNode { /** We use the visitor pattern to traverse IR nodes throughout the * compiler, so we have a virtual accept method which accepts * visitors. 
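 * (For instance, given a hypothetical IRVisitor subclass that counts Add
 * nodes, calling node->accept(&counter) dispatches to the visit overload
 * matching the node's concrete type, e.g. IRVisitor::visit(const Add *);
 * this is a hedged sketch of typical usage rather than part of the original
 * comment.)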
*/ virtual void accept(IRVisitor *v) const = 0; IRNode(IRNodeType t) : node_type(t) { } virtual ~IRNode() = default; /** These classes are all managed with intrusive reference * counting, so we also track a reference count. It's mutable * so that we can do reference counting even through const * references to IR nodes. */ mutable RefCount ref_count; /** Each IR node subclass has a unique identifier. We can compare * these values to do runtime type identification. We don't * compile with rtti because that injects run-time type * identification stuff everywhere (and often breaks when linking * external libraries compiled without it), and we only want it * for IR nodes. One might want to put this value in the vtable, * but that adds another level of indirection, and for Exprs we * have 32 free bits in between the ref count and the Type * anyway, so this doesn't increase the memory footprint of an IR node. */ IRNodeType node_type; }; template<> inline RefCount &ref_count(const IRNode *t) noexcept { return t->ref_count; } template<> inline void destroy(const IRNode *t) { delete t; } /** IR nodes are split into expressions and statements. These are similar to expressions and statements in C - expressions represent some value and have some type (e.g. x + 3), and statements are side-effecting pieces of code that do not represent a value (e.g. assert(x > 3)) */ /** A base class for statement nodes. They have no properties or methods beyond base IR nodes for now. */ struct BaseStmtNode : public IRNode { BaseStmtNode(IRNodeType t) : IRNode(t) { } virtual Stmt mutate_stmt(IRMutator *v) const = 0; }; /** A base class for expression nodes. They all contain their types * (e.g. Int(32), Float(32)) */ struct BaseExprNode : public IRNode { BaseExprNode(IRNodeType t) : IRNode(t) { } virtual Expr mutate_expr(IRMutator *v) const = 0; Type type; }; /** We use the "curiously recurring template pattern" to avoid duplicated code in the IR Nodes. These classes live between the abstract base classes and the actual IR Nodes in the inheritance hierarchy. It provides an implementation of the accept function necessary for the visitor pattern to work, and a concrete instantiation of a unique IRNodeType per class. */ template struct ExprNode : public BaseExprNode { void accept(IRVisitor *v) const override; Expr mutate_expr(IRMutator *v) const override; ExprNode() : BaseExprNode(T::_node_type) { } ~ExprNode() override = default; }; template struct StmtNode : public BaseStmtNode { void accept(IRVisitor *v) const override; Stmt mutate_stmt(IRMutator *v) const override; StmtNode() : BaseStmtNode(T::_node_type) { } ~StmtNode() override = default; }; /** IR nodes are passed around opaque handles to them. This is a base class for those handles. It manages the reference count, and dispatches visitors. */ struct IRHandle : public IntrusivePtr { HALIDE_ALWAYS_INLINE IRHandle() = default; HALIDE_ALWAYS_INLINE IRHandle(const IRNode *p) : IntrusivePtr(p) { } /** Dispatch to the correct visitor method for this node. E.g. if * this node is actually an Add node, then this will call * IRVisitor::visit(const Add *) */ void accept(IRVisitor *v) const { ptr->accept(v); } /** Downcast this ir node to its actual type (e.g. Add, or * Select). This returns nullptr if the node is not of the requested * type. 
Example usage: * * if (const Add *add = node->as()) { * // This is an add node * } */ template const T *as() const { if (ptr && ptr->node_type == T::_node_type) { return (const T *)ptr; } return nullptr; } IRNodeType node_type() const { return ptr->node_type; } }; /** Integer constants */ struct IntImm : public ExprNode { int64_t value; static const IntImm *make(Type t, int64_t value); static const IRNodeType _node_type = IRNodeType::IntImm; }; /** Unsigned integer constants */ struct UIntImm : public ExprNode { uint64_t value; static const UIntImm *make(Type t, uint64_t value); static const IRNodeType _node_type = IRNodeType::UIntImm; }; /** Floating point constants */ struct FloatImm : public ExprNode { double value; static const FloatImm *make(Type t, double value); static const IRNodeType _node_type = IRNodeType::FloatImm; }; /** String constants */ struct StringImm : public ExprNode { std::string value; static const StringImm *make(const std::string &val); static const IRNodeType _node_type = IRNodeType::StringImm; }; } // namespace Internal /** A fragment of Halide syntax. It's implemented as reference-counted * handle to a concrete expression node, but it's immutable, so you * can treat it as a value type. */ struct Expr : public Internal::IRHandle { /** Make an undefined expression */ HALIDE_ALWAYS_INLINE Expr() = default; /** Make an expression from a concrete expression node pointer (e.g. Add) */ HALIDE_ALWAYS_INLINE Expr(const Internal::BaseExprNode *n) : IRHandle(n) { } /** Make an expression representing numeric constants of various types. */ // @{ explicit Expr(int8_t x) : IRHandle(Internal::IntImm::make(Int(8), x)) { } explicit Expr(int16_t x) : IRHandle(Internal::IntImm::make(Int(16), x)) { } Expr(int32_t x) : IRHandle(Internal::IntImm::make(Int(32), x)) { } explicit Expr(int64_t x) : IRHandle(Internal::IntImm::make(Int(64), x)) { } explicit Expr(uint8_t x) : IRHandle(Internal::UIntImm::make(UInt(8), x)) { } explicit Expr(uint16_t x) : IRHandle(Internal::UIntImm::make(UInt(16), x)) { } explicit Expr(uint32_t x) : IRHandle(Internal::UIntImm::make(UInt(32), x)) { } explicit Expr(uint64_t x) : IRHandle(Internal::UIntImm::make(UInt(64), x)) { } Expr(float16_t x) : IRHandle(Internal::FloatImm::make(Float(16), (double)x)) { } Expr(bfloat16_t x) : IRHandle(Internal::FloatImm::make(BFloat(16), (double)x)) { } Expr(float x) : IRHandle(Internal::FloatImm::make(Float(32), x)) { } explicit Expr(double x) : IRHandle(Internal::FloatImm::make(Float(64), x)) { } // @} /** Make an expression representing a const string (i.e. a StringImm) */ Expr(const std::string &s) : IRHandle(Internal::StringImm::make(s)) { } /** Override get() to return a BaseExprNode * instead of an IRNode * */ HALIDE_ALWAYS_INLINE const Internal::BaseExprNode *get() const { return (const Internal::BaseExprNode *)ptr; } /** Get the type of this expression node */ HALIDE_ALWAYS_INLINE Type type() const { return get()->type; } }; /** This lets you use an Expr as a key in a map of the form * map */ struct ExprCompare { bool operator()(const Expr &a, const Expr &b) const { return a.get() < b.get(); } }; /** A single-dimensional span. Includes all numbers between min and * (min + extent - 1). */ struct Range { Expr min, extent; Range() = default; Range(const Expr &min_in, const Expr &extent_in); }; /** A multi-dimensional box. The outer product of the elements */ typedef std::vector Region; /** An enum describing different address spaces to be used with Func::store_in. 
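 * (A hedged usage sketch: for an intermediate Func f, a schedule might call
 * f.store_in(MemoryType::Stack) to request an alloca-backed allocation, or
 * f.store_in(MemoryType::GPUShared) to place it in GPU shared memory, while
 * the default MemoryType::Auto lets Halide choose.)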
*/ enum class MemoryType { /** Let Halide select a storage type automatically */ Auto, /** Heap/global memory. Allocated using halide_malloc, or * halide_device_malloc */ Heap, /** Stack memory. Allocated using alloca. Requires a constant * size. Corresponds to per-thread local memory on the GPU. If all * accesses are at constant coordinates, may be promoted into the * register file at the discretion of the register allocator. */ Stack, /** Register memory. The allocation should be promoted into the * register file. All stores must be at constant coordinates. May * be spilled to the stack at the discretion of the register * allocator. */ Register, /** Allocation is stored in GPU shared memory. Also known as * "local" in OpenCL, and "threadgroup" in metal. Can be shared * across GPU threads within the same block. */ GPUShared, /** Allocate Locked Cache Memory to act as local memory */ LockedCache, /** Vector Tightly Coupled Memory. HVX (Hexagon) local memory available on * v65+. This memory has higher performance and lower power. Ideal for * intermediate buffers. Necessary for vgather-vscatter instructions * on Hexagon */ VTCM, }; namespace Internal { /** An enum describing a type of loop traversal. Used in schedules, * and in the For loop IR node. Serial is a conventional ordered for * loop. Iterations occur in increasing order, and each iteration must * appear to have finished before the next begins. Parallel, GPUBlock, * and GPUThread are parallel and unordered: iterations may occur in * any order, and multiple iterations may occur * simultaneously. Vectorized and GPULane are parallel and * synchronous: they act as if all iterations occur at the same time * in lockstep. */ enum class ForType { Serial, Parallel, Vectorized, Unrolled, Extern, GPUBlock, GPUThread, GPULane, }; /** Check if for_type executes for loop iterations in parallel and unordered. */ bool is_unordered_parallel(ForType for_type); /** Returns true if for_type executes for loop iterations in parallel. */ bool is_parallel(ForType for_type); /** A reference-counted handle to a statement node. */ struct Stmt : public IRHandle { Stmt() = default; Stmt(const BaseStmtNode *n) : IRHandle(n) { } /** Override get() to return a BaseStmtNode * instead of an IRNode * */ HALIDE_ALWAYS_INLINE const BaseStmtNode *get() const { return (const Internal::BaseStmtNode *)ptr; } /** This lets you use a Stmt as a key in a map of the form * map */ struct Compare { bool operator()(const Stmt &a, const Stmt &b) const { return a.ptr < b.ptr; } }; }; } // namespace Internal } // namespace Halide #endif #include /** \file * Defines the lowering pass that insert mutex allocation code & locks * for the atomic nodes that require mutex locks. It also checks whether * the atomic operation is valid. It rejects algorithms that have indexing * on left-hand-side which references the buffer itself, e.g. * f(clamp(f(r), 0, 100)) = f(r) + 1 * If the SplitTuple pass does not lift out the Provide value as a let * expression. This is confirmed by checking whether the Provide nodes * inside an Atomic node have let binding values accessing the buffers * inside the atomic node. * Finally, it lifts the store indexing expressions inside the atomic node * outside of the atomic to avoid side-effects inside those expressions * being evaluated twice. 
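 * By contrast, an update whose left-hand-side index does not load from the
 * buffer being updated, e.g. f(r) = f(r) + 1, does not trip that check (a
 * hedged illustration, not taken from the original comment).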
*/ namespace Halide { namespace Internal { class Function; Stmt add_atomic_mutex(Stmt s, const std::map &env); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_INTERNAL_ADD_IMAGE_CHECKS_H #define HALIDE_INTERNAL_ADD_IMAGE_CHECKS_H /** \file * * Defines the lowering pass that adds the assertions that validate * input and output buffers. */ #include #include #include #ifndef HALIDE_BOUNDS_H #define HALIDE_BOUNDS_H /** \file * Methods for computing the upper and lower bounds of an expression, * and the regions of a function read or written by a statement. */ #ifndef HALIDE_INTERVAL_H #define HALIDE_INTERVAL_H /** \file * Defines the Interval class */ namespace Halide { namespace Internal { /** A class to represent ranges of Exprs. Can be unbounded above or below. */ struct Interval { /** Exprs to represent positive and negative infinity */ #ifdef COMPILING_HALIDE static HALIDE_ALWAYS_INLINE Expr pos_inf() { return pos_inf_expr; } static HALIDE_ALWAYS_INLINE Expr neg_inf() { return neg_inf_expr; } #else static Expr pos_inf() { return pos_inf_noinline(); } static Expr neg_inf() { return neg_inf_noinline(); } #endif /** The lower and upper bound of the interval. They are included * in the interval. */ Expr min, max; /** A default-constructed Interval is everything */ Interval() : min(neg_inf()), max(pos_inf()) { } /** Construct an interval from a lower and upper bound. */ Interval(const Expr &min, const Expr &max) : min(min), max(max) { internal_assert(min.defined() && max.defined()); } /** The interval representing everything. */ static Interval everything(); /** The interval representing nothing. */ static Interval nothing(); /** Construct an interval representing a single point */ static Interval single_point(const Expr &e); /** Is the interval the empty set */ bool is_empty() const; /** Is the interval the entire range */ bool is_everything() const; /** Is the interval just a single value (min == max) */ bool is_single_point() const; /** Is the interval a particular single value */ bool is_single_point(const Expr &e) const; /** Does the interval have a finite least upper bound */ bool has_upper_bound() const; /** Does the interval have a finite greatest lower bound */ bool has_lower_bound() const; /** Does the interval have a finite upper and lower bound */ bool is_bounded() const; /** Is the interval the same as another interval */ bool same_as(const Interval &other) const; /** Expand the interval to include another Interval */ void include(const Interval &i); /** Expand the interval to include an Expr */ void include(const Expr &e); /** Construct the smallest interval containing two intervals. */ static Interval make_union(const Interval &a, const Interval &b); /** Construct the largest interval contained within two intervals. */ static Interval make_intersection(const Interval &a, const Interval &b); /** An eagerly-simplifying max of two Exprs that respects infinities. */ static Expr make_max(const Expr &a, const Expr &b); /** An eagerly-simplifying min of two Exprs that respects infinities. */ static Expr make_min(const Expr &a, const Expr &b); /** Equivalent to same_as. Exists so that the autoscheduler can * compare two map for equality in order to * cache computations. */ bool operator==(const Interval &other) const; private: static Expr neg_inf_expr, pos_inf_expr; // Never used inside libHalide; provided for Halide tests, to avoid needing to export // data fields in some build environments. 
static Expr pos_inf_noinline(); static Expr neg_inf_noinline(); }; } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_SCOPE_H #define HALIDE_SCOPE_H #include #include #include #include #include #include /** \file * Defines the Scope class, which is used for keeping track of names in a scope while traversing IR */ namespace Halide { namespace Internal { /** A stack which can store one item very efficiently. Using this * instead of std::stack speeds up Scope substantially. */ template class SmallStack { private: T _top; std::vector _rest; bool _empty = true; public: SmallStack() = default; void pop() { if (_rest.empty()) { _empty = true; _top = T(); } else { _top = std::move(_rest.back()); _rest.pop_back(); } } void push(T t) { if (!_empty) { _rest.push_back(std::move(_top)); } _top = std::move(t); _empty = false; } T top() const { return _top; } T &top_ref() { return _top; } const T &top_ref() const { return _top; } bool empty() const { return _empty; } size_t size() const { return _empty ? 0 : (_rest.size() + 1); } }; template<> class SmallStack { // A stack of voids. Voids are all the same, so just record how many voids are in the stack int counter = 0; public: void pop() { counter--; } void push() { counter++; } bool empty() const { return counter == 0; } }; /** A common pattern when traversing Halide IR is that you need to * keep track of stuff when you find a Let or a LetStmt, and that it * should hide previous values with the same name until you leave the * Let or LetStmt nodes This class helps with that. */ template class Scope { private: std::map> table; const Scope *containing_scope = nullptr; public: Scope() = default; Scope(Scope &&that) noexcept = default; Scope &operator=(Scope &&that) noexcept = default; // Copying a scope object copies a large table full of strings and // stacks. Bad idea. Scope(const Scope &) = delete; Scope &operator=(const Scope &) = delete; /** Set the parent scope. If lookups fail in this scope, they * check the containing scope before returning an error. Caller is * responsible for managing the memory of the containing scope. */ void set_containing_scope(const Scope *s) { containing_scope = s; } /** A const ref to an empty scope. Useful for default function * arguments, which would otherwise require a copy constructor * (with llvm in c++98 mode) */ static const Scope &empty_scope() { static Scope _empty_scope; return _empty_scope; } /** Retrieve the value referred to by a name */ template::value>::type> T2 get(const std::string &name) const { typename std::map>::const_iterator iter = table.find(name); if (iter == table.end() || iter->second.empty()) { if (containing_scope) { return containing_scope->get(name); } else { internal_error << "Name not in Scope: " << name << "\n" << *this << "\n"; } } return iter->second.top(); } /** Return a reference to an entry. Does not consider the containing scope. */ template::value>::type> T2 &ref(const std::string &name) { typename std::map>::iterator iter = table.find(name); if (iter == table.end() || iter->second.empty()) { internal_error << "Name not in Scope: " << name << "\n" << *this << "\n"; } return iter->second.top_ref(); } /** Tests if a name is in scope */ bool contains(const std::string &name) const { typename std::map>::const_iterator iter = table.find(name); if (iter == table.end() || iter->second.empty()) { if (containing_scope) { return containing_scope->contains(name); } else { return false; } } return true; } /** How many nested definitions of a single name exist? 
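 * (A hedged example: after scope.push("x", a) followed by scope.push("x", b),
 * count("x") returns 2 and get("x") returns b; a single pop("x") then makes
 * get("x") return a again. Note that count() consults only this scope's own
 * table, not any containing scope.)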
*/ size_t count(const std::string &name) const { auto it = table.find(name); if (it == table.end()) { return 0; } else { return it->second.size(); } } /** Add a new (name, value) pair to the current scope. Hide old * values that have this name until we pop this name. */ template::value>::type> void push(const std::string &name, T2 &&value) { table[name].push(std::forward(value)); } template::value>::type> void push(const std::string &name) { table[name].push(); } /** A name goes out of scope. Restore whatever its old value * was (or remove it entirely if there was nothing else of the * same name in an outer scope) */ void pop(const std::string &name) { typename std::map>::iterator iter = table.find(name); internal_assert(iter != table.end()) << "Name not in Scope: " << name << "\n" << *this << "\n"; iter->second.pop(); if (iter->second.empty()) { table.erase(iter); } } /** Iterate through the scope. Does not capture any containing scope. */ class const_iterator { typename std::map>::const_iterator iter; public: explicit const_iterator(const typename std::map>::const_iterator &i) : iter(i) { } const_iterator() { } bool operator!=(const const_iterator &other) { return iter != other.iter; } void operator++() { ++iter; } const std::string &name() { return iter->first; } const SmallStack &stack() { return iter->second; } template::value>::type> const T2 &value() { return iter->second.top_ref(); } }; const_iterator cbegin() const { return const_iterator(table.begin()); } const_iterator cend() const { return const_iterator(table.end()); } void swap(Scope &other) { table.swap(other.table); std::swap(containing_scope, other.containing_scope); } }; template std::ostream &operator<<(std::ostream &stream, const Scope &s) { stream << "{\n"; typename Scope::const_iterator iter; for (iter = s.cbegin(); iter != s.cend(); ++iter) { stream << " " << iter.name() << "\n"; } stream << "}"; return stream; } /** Helper class for pushing/popping Scope<> values, to allow * for early-exit in Visitor/Mutators that preserves correctness. * Note that this name can be a bit confusing, since there are two "scopes" * involved here: * - the Scope object itself * - the lifetime of this helper object * The "Scoped" in this class name refers to the latter, as it temporarily binds * a name within the scope of this helper's lifetime. */ template struct ScopedBinding { Scope *scope = nullptr; std::string name; ScopedBinding() = default; ScopedBinding(Scope &s, const std::string &n, T value) : scope(&s), name(n) { scope->push(name, std::move(value)); } ScopedBinding(bool condition, Scope &s, const std::string &n, const T &value) : scope(condition ? &s : nullptr), name(n) { if (condition) { scope->push(name, value); } } bool bound() const { return scope != nullptr; } ~ScopedBinding() { if (scope) { scope->pop(name); } } // allow move but not copy ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), name(std::move(that.name)) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } void operator=(const ScopedBinding &that) = delete; void operator=(ScopedBinding &&that) = delete; }; template<> struct ScopedBinding { Scope<> *scope; std::string name; ScopedBinding(Scope<> &s, const std::string &n) : scope(&s), name(n) { scope->push(name); } ScopedBinding(bool condition, Scope<> &s, const std::string &n) : scope(condition ? 
&s : nullptr), name(n) { if (condition) { scope->push(name); } } ~ScopedBinding() { if (scope) { scope->pop(name); } } // allow move but not copy ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), name(std::move(that.name)) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } void operator=(const ScopedBinding &that) = delete; void operator=(ScopedBinding &&that) = delete; }; } // namespace Internal } // namespace Halide #endif namespace Halide { namespace Internal { class Function; typedef std::map, Interval> FuncValueBounds; const FuncValueBounds &empty_func_value_bounds(); /** Given an expression in some variables, and a map from those * variables to their bounds (in the form of (minimum possible value, * maximum possible value)), compute two expressions that give the * minimum possible value and the maximum possible value of this * expression. Max or min may be undefined expressions if the value is * not bounded above or below. If the expression is a vector, also * takes the bounds across the vector lanes and returns a scalar * result. * * This is for tasks such as deducing the region of a buffer * loaded by a chunk of code. */ Interval bounds_of_expr_in_scope(const Expr &expr, const Scope &scope, const FuncValueBounds &func_bounds = empty_func_value_bounds(), bool const_bound = false); /** Given a varying expression, try to find a constant that is either: * An upper bound (always greater than or equal to the expression), or * A lower bound (always less than or equal to the expression) * If it fails, returns an undefined Expr. */ enum class Direction { Upper, Lower }; Expr find_constant_bound(const Expr &e, Direction d, const Scope &scope = Scope::empty_scope()); /** Find bounds for a varying expression that are either constants or * +/-inf. */ Interval find_constant_bounds(const Expr &e, const Scope &scope); /** Represents the bounds of a region of arbitrary dimension. Zero * dimensions corresponds to a scalar region. */ struct Box { /** The conditions under which this region may be touched. */ Expr used; /** The bounds if it is touched. */ std::vector bounds; Box() = default; explicit Box(size_t sz) : bounds(sz) { } explicit Box(const std::vector &b) : bounds(b) { } size_t size() const { return bounds.size(); } bool empty() const { return bounds.empty(); } Interval &operator[](size_t i) { return bounds[i]; } const Interval &operator[](size_t i) const { return bounds[i]; } void resize(size_t sz) { bounds.resize(sz); } void push_back(const Interval &i) { bounds.push_back(i); } /** Check if the used condition is defined and not trivially true. */ bool maybe_unused() const; friend std::ostream &operator<<(std::ostream &stream, const Box &b); }; /** Expand box a to encompass box b */ void merge_boxes(Box &a, const Box &b); /** Test if box a could possibly overlap box b. */ bool boxes_overlap(const Box &a, const Box &b); /** The union of two boxes */ Box box_union(const Box &a, const Box &b); /** The intersection of two boxes */ Box box_intersection(const Box &a, const Box &b); /** Test if box a provably contains box b */ bool box_contains(const Box &a, const Box &b); /** Compute rectangular domains large enough to cover all the 'Call's * to each function that occurs within a given statement or * expression. This is useful for figuring out what regions of things * to evaluate. 
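 * (A hedged sketch: for an expression such as f(x) + f(x + 10), the entry for
 * "f" in the result of boxes_required would be a one-dimensional Box whose
 * interval runs from x to x + 10, with x itself constrained by whatever the
 * supplied Scope says about its bounds.)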
*/ // @{ std::map boxes_required(const Expr &e, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); std::map boxes_required(Stmt s, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); // @} /** Compute rectangular domains large enough to cover all the * 'Provides's to each function that occurs within a given statement * or expression. */ // @{ std::map boxes_provided(const Expr &e, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); std::map boxes_provided(Stmt s, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); // @} /** Compute rectangular domains large enough to cover all the 'Call's * and 'Provides's to each function that occurs within a given * statement or expression. */ // @{ std::map boxes_touched(const Expr &e, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); std::map boxes_touched(Stmt s, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); // @} /** Variants of the above that are only concerned with a single function. */ // @{ Box box_required(const Expr &e, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); Box box_required(Stmt s, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); Box box_provided(const Expr &e, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); Box box_provided(Stmt s, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); Box box_touched(const Expr &e, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); Box box_touched(Stmt s, const std::string &fn, const Scope &scope = Scope::empty_scope(), const FuncValueBounds &func_bounds = empty_func_value_bounds()); // @} /** Compute the maximum and minimum possible value for each function * in an environment. */ FuncValueBounds compute_function_value_bounds(const std::vector &order, const std::map &env); void bounds_test(); } // namespace Internal } // namespace Halide #endif #include namespace Halide { struct Target; namespace Internal { class Function; /** Insert checks to make sure a statement doesn't read out of bounds * on inputs or outputs, and that the inputs and outputs conform to * the format required (e.g. stride.0 must be 1). */ Stmt add_image_checks(Stmt s, const std::vector &outputs, const Target &t, const std::vector &order, const std::map &env, const FuncValueBounds &fb, bool will_inject_host_copies); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_INTERNAL_ADD_PARAMETER_CHECKS_H #define HALIDE_INTERNAL_ADD_PARAMETER_CHECKS_H /** \file * * Defines the lowering pass that adds the assertions that validate * scalar parameters. */ #include #ifndef HALIDE_TARGET_H #define HALIDE_TARGET_H /** \file * Defines the structure that describes a Halide target. */ #include #include #include #ifndef HALIDE_DEVICEAPI_H #define HALIDE_DEVICEAPI_H /** \file * Defines DeviceAPI. */ #include #include namespace Halide { /** An enum describing a type of device API. 
Used by schedules, and in * the For loop IR node. */ enum class DeviceAPI { None, /// Used to denote for loops that run on the same device as the containing code. Host, Default_GPU, CUDA, OpenCL, GLSL, OpenGLCompute, Metal, Hexagon, HexagonDma, D3D12Compute, }; /** An array containing all the device apis. Useful for iterating * through them. */ const DeviceAPI all_device_apis[] = {DeviceAPI::None, DeviceAPI::Host, DeviceAPI::Default_GPU, DeviceAPI::CUDA, DeviceAPI::OpenCL, DeviceAPI::GLSL, DeviceAPI::OpenGLCompute, DeviceAPI::Metal, DeviceAPI::Hexagon, DeviceAPI::HexagonDma, DeviceAPI::D3D12Compute}; } // namespace Halide #endif // HALIDE_DEVICEAPI_H namespace Halide { /** A struct representing a target machine and os to generate code for. */ struct Target { /** The operating system used by the target. Determines which * system calls to generate. * Corresponds to os_name_map in Target.cpp. */ enum OS { OSUnknown = 0, Linux, Windows, OSX, Android, IOS, QuRT, NoOS, Fuchsia, WebAssemblyRuntime } os; /** The architecture used by the target. Determines the * instruction set to use. * Corresponds to arch_name_map in Target.cpp. */ enum Arch { ArchUnknown = 0, X86, ARM, MIPS, Hexagon, POWERPC, WebAssembly, RISCV } arch; /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */ int bits; /** Optional features a target can have. * Corresponds to feature_name_map in Target.cpp. * See definitions in HalideRuntime.h for full information. */ enum Feature { JIT = halide_target_feature_jit, Debug = halide_target_feature_debug, NoAsserts = halide_target_feature_no_asserts, NoBoundsQuery = halide_target_feature_no_bounds_query, SSE41 = halide_target_feature_sse41, AVX = halide_target_feature_avx, AVX2 = halide_target_feature_avx2, FMA = halide_target_feature_fma, FMA4 = halide_target_feature_fma4, F16C = halide_target_feature_f16c, ARMv7s = halide_target_feature_armv7s, NoNEON = halide_target_feature_no_neon, VSX = halide_target_feature_vsx, POWER_ARCH_2_07 = halide_target_feature_power_arch_2_07, CUDA = halide_target_feature_cuda, CUDACapability30 = halide_target_feature_cuda_capability30, CUDACapability32 = halide_target_feature_cuda_capability32, CUDACapability35 = halide_target_feature_cuda_capability35, CUDACapability50 = halide_target_feature_cuda_capability50, CUDACapability61 = halide_target_feature_cuda_capability61, OpenCL = halide_target_feature_opencl, CLDoubles = halide_target_feature_cl_doubles, CLHalf = halide_target_feature_cl_half, CLAtomics64 = halide_target_feature_cl_atomic64, OpenGL = halide_target_feature_opengl, OpenGLCompute = halide_target_feature_openglcompute, EGL = halide_target_feature_egl, UserContext = halide_target_feature_user_context, Matlab = halide_target_feature_matlab, Profile = halide_target_feature_profile, NoRuntime = halide_target_feature_no_runtime, Metal = halide_target_feature_metal, CPlusPlusMangling = halide_target_feature_c_plus_plus_mangling, LargeBuffers = halide_target_feature_large_buffers, HexagonDma = halide_target_feature_hexagon_dma, HVX_64 = halide_target_feature_hvx_64, HVX_128 = halide_target_feature_hvx_128, HVX_v62 = halide_target_feature_hvx_v62, HVX_v65 = halide_target_feature_hvx_v65, HVX_v66 = halide_target_feature_hvx_v66, HVX_shared_object = halide_target_feature_hvx_use_shared_object, FuzzFloatStores = halide_target_feature_fuzz_float_stores, SoftFloatABI = halide_target_feature_soft_float_abi, MSAN = halide_target_feature_msan, AVX512 = halide_target_feature_avx512, AVX512_KNL = halide_target_feature_avx512_knl, 
AVX512_Skylake = halide_target_feature_avx512_skylake, AVX512_Cannonlake = halide_target_feature_avx512_cannonlake, TraceLoads = halide_target_feature_trace_loads, TraceStores = halide_target_feature_trace_stores, TraceRealizations = halide_target_feature_trace_realizations, TracePipeline = halide_target_feature_trace_pipeline, D3D12Compute = halide_target_feature_d3d12compute, StrictFloat = halide_target_feature_strict_float, TSAN = halide_target_feature_tsan, ASAN = halide_target_feature_asan, CheckUnsafePromises = halide_target_feature_check_unsafe_promises, EmbedBitcode = halide_target_feature_embed_bitcode, EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt, DisableLLVMLoopOpt = halide_target_feature_disable_llvm_loop_opt, WasmSimd128 = halide_target_feature_wasm_simd128, WasmSignExt = halide_target_feature_wasm_signext, SVE = halide_target_feature_sve, SVE2 = halide_target_feature_sve2, FeatureEnd = halide_target_feature_end }; Target() : os(OSUnknown), arch(ArchUnknown), bits(0) { } Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>()) : os(o), arch(a), bits(b) { for (const auto &f : initial_features) { set_feature(f); } } /** Given a string of the form used in HL_TARGET * (e.g. "x86-64-avx"), construct the Target it specifies. Note * that this always starts with the result of get_host_target(), * replacing only the parts found in the target string, so if you * omit (say) an OS specification, the host OS will be used * instead. An empty string is exactly equivalent to * get_host_target(). * * Invalid target strings will fail with a user_error. */ // @{ explicit Target(const std::string &s); explicit Target(const char *s); // @} /** Check if a target string is valid. */ static bool validate_target_string(const std::string &s); void set_feature(Feature f, bool value = true); void set_features(const std::vector<Feature> &features_to_set, bool value = true); bool has_feature(Feature f) const; inline bool has_feature(halide_target_feature_t f) const { return has_feature((Feature)f); } bool features_any_of(const std::vector<Feature> &test_features) const; bool features_all_of(const std::vector<Feature> &test_features) const; /** Return a copy of the target with the given feature set. * This is convenient when enabling certain features (e.g. NoBoundsQuery) * in an initialization list, where the target to be mutated may be * a const reference. */ Target with_feature(Feature f) const; /** Return a copy of the target with the given feature cleared. * This is convenient when disabling certain features (e.g. NoBoundsQuery) * in an initialization list, where the target to be mutated may be * a const reference. */ Target without_feature(Feature f) const; /** Is a fully featured GPU compute runtime enabled? I.e. is * Func::gpu_tile and similar going to work? Currently includes * CUDA, OpenCL, Metal and D3D12Compute. We do not include OpenGL, * because it is not capable of gpgpu, and is not scheduled via * Func::gpu_tile. * TODO: Should OpenGLCompute be included here? */ bool has_gpu_feature() const; /** Does this target allow using a certain type. Generally all * types except 64-bit float and int/uint should be supported by * all backends. * * It is likely better to call the version below which takes a DeviceAPI. */ bool supports_type(const Type &t) const; /** Does this target allow using a certain type on a certain device. * This is the preferred version of this routine.
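 * (For example, a hedged check such as
 * t.supports_type(Float(64), DeviceAPI::Metal) asks whether 64-bit floats may
 * be used in code offloaded to Metal when compiling for a target t.)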
*/ bool supports_type(const Type &t, DeviceAPI device) const; /** Returns whether a particular device API can be used with this * Target. */ bool supports_device_api(DeviceAPI api) const; /** If this Target (including all Features) requires a specific DeviceAPI, * return it. If it doesn't, return DeviceAPI::None. If the Target has * features with multiple (different) DeviceAPI requirements, the result * will be an arbitrary DeviceAPI. */ DeviceAPI get_required_device_api() const; bool operator==(const Target &other) const { return os == other.os && arch == other.arch && bits == other.bits && features == other.features; } bool operator!=(const Target &other) const { return !(*this == other); } /** * Create a "greatest common denominator" runtime target that is compatible with * both this target and \p other. Used by generators to conveniently select a suitable * runtime when linking together multiple functions. * * @param other The other target from which we compute the gcd target. * @param[out] result The gcd target if we return true, otherwise unmodified. Can be the same as *this. * @return Whether it was possible to find a compatible target (true) or not. */ bool get_runtime_compatible_target(const Target &other, Target &result); /** Convert the Target into a string form that can be reconstituted * by merge_string(), which will always be of the form * * arch-bits-os-feature1-feature2...featureN. * * Note that is guaranteed that Target(t1.to_string()) == t1, * but not that Target(s).to_string() == s (since there can be * multiple strings that parse to the same Target)... * *unless* t1 contains 'unknown' fields (in which case you'll get a string * that can't be parsed, which is intentional). */ std::string to_string() const; /** Given a data type, return an estimate of the "natural" vector size * for that data type when compiling for this Target. */ int natural_vector_size(const Halide::Type &t) const; /** Given a data type, return an estimate of the "natural" vector size * for that data type when compiling for this Target. */ template int natural_vector_size() const { return natural_vector_size(type_of()); } /** Return true iff 64 bits and has_feature(LargeBuffers). */ bool has_large_buffers() const { return bits == 64 && has_feature(LargeBuffers); } /** Return the maximum buffer size in bytes supported on this * Target. This is 2^31 - 1 except on 64-bit targets when the LargeBuffers * feature is enabled, which expands the maximum to 2^63 - 1. */ int64_t maximum_buffer_size() const { if (has_large_buffers()) { return (((uint64_t)1) << 63) - 1; } else { return (((uint64_t)1) << 31) - 1; } } /** Was libHalide compiled with support for this target? */ bool supported() const; /** Return a bitset of the Featuress set in this Target (set = 1). * Note that while this happens to be the current internal representation, * that might not always be the case. */ const std::bitset &get_features_bitset() const { return features; } /** Return the name corresponding to a given Feature, in the form * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). */ static std::string feature_to_name(Target::Feature feature); /** Return the feature corresponding to a given name, in the form * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). * If the string is not a known feature name, return FeatureEnd. */ static Target::Feature feature_from_name(const std::string &name); private: /** A bitmask that stores the active features. 
*/ std::bitset features; }; /** Return the target corresponding to the host machine. */ Target get_host_target(); /** Return the target that Halide will use. If HL_TARGET is set it * uses that. Otherwise calls \ref get_host_target */ Target get_target_from_environment(); /** Return the target that Halide will use for jit-compilation. If * HL_JIT_TARGET is set it uses that. Otherwise calls \ref * get_host_target. Throws an error if the architecture, bit width, * and OS of the target do not match the host target, so this is only * useful for controlling the feature set. */ Target get_jit_target_from_environment(); /** Get the Target feature corresponding to a DeviceAPI. For device * apis that do not correspond to any single target feature, returns * Target::FeatureEnd */ Target::Feature target_feature_for_device_api(DeviceAPI api); namespace Internal { void target_test(); } } // namespace Halide #endif namespace Halide { namespace Internal { /** Insert checks to make sure that all referenced parameters meet * their constraints. Also injects any custom requirements provided * by the user. */ Stmt add_parameter_checks(const std::vector &requirements, Stmt s, const Target &t); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_ALIGN_LOADS_H #define HALIDE_ALIGN_LOADS_H /** \file * Defines a lowering pass that rewrites unaligned loads into * sequences of aligned loads. */ namespace Halide { namespace Internal { /** Attempt to rewrite unaligned loads from buffers which are known to * be aligned to instead load aligned vectors that cover the original * load, and then slice the original load out of the aligned * vectors. */ Stmt align_loads(const Stmt &s, int alignment); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_ALLOCATION_BOUNDS_INFERENCE_H #define HALIDE_ALLOCATION_BOUNDS_INFERENCE_H /** \file * Defines the lowering pass that determines how large internal allocations should be. */ #include #include #include namespace Halide { namespace Internal { class Function; /** Take a partially statement with Realize nodes in terms of * variables, and define values for those variables. */ Stmt allocation_bounds_inference(Stmt s, const std::map &env, const std::map, Interval> &func_bounds); } // namespace Internal } // namespace Halide #endif #ifndef APPLY_SPLIT_H #define APPLY_SPLIT_H /** \file * * Defines method that returns a list of let stmts, substitutions, and * predicates to be added given a split schedule. */ #include #include #include #include #ifndef HALIDE_SCHEDULE_H #define HALIDE_SCHEDULE_H /** \file * Defines the internal representation of the schedule for a function */ #include #include #include #include #ifndef HALIDE_FUNCTION_PTR_H #define HALIDE_FUNCTION_PTR_H namespace Halide { namespace Internal { /** Functions are allocated in groups for memory management. Each * group has a ref count associated with it. All within-group * references must be weak. If there are any references from outside * the group, at least one must be strong. Within-group references * may form cycles, but there may not be reference cycles that span * multiple groups. These rules are not enforced automatically. */ struct FunctionGroup; /** The opaque struct describing a Halide function. Wrap it in a * Function object to access it. */ struct FunctionContents; /** A possibly-weak pointer to a Halide function. Take care to follow * the rules mentioned above. Preserves weakness/strength on copy. * * Note that Function objects are always strong pointers to Halide * functions. 
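 * (A hedged illustration of the rule above: when one Function stores a
 * reference to another Function in the same group, it should hold that
 * reference through a FunctionPtr on which weaken() has been called, so that
 * only references from outside the group keep the group's reference count
 * alive.)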
*/ struct FunctionPtr { /** A strong and weak pointer to the group. Only one of these * should be non-zero. */ // @{ IntrusivePtr strong; FunctionGroup *weak = nullptr; // @} /** The index of the function within the group. */ int idx = 0; /** Get a pointer to the group this Function belongs to. */ FunctionGroup *group() const { return weak ? weak : strong.get(); } /** Get the opaque FunctionContents object this pointer refers * to. Wrap it in a Function to do anything interesting with it. */ // @{ FunctionContents *get() const; FunctionContents &operator*() const { return *get(); } FunctionContents *operator->() const { return get(); } // @} /** Convert from a strong reference to a weak reference. Does * nothing if the pointer is undefined, or if the reference is * already weak. */ void weaken() { weak = group(); strong = nullptr; } /** Convert from a weak reference to a strong reference. Does * nothing if the pointer is undefined, or if the reference is * already strong. */ void strengthen() { strong = group(); weak = nullptr; } /** Check if the reference is defined. */ bool defined() const { return weak || strong.defined(); } /** Check if two FunctionPtrs refer to the same Function. */ bool same_as(const FunctionPtr &other) const { return idx == other.idx && group() == other.group(); } /** Pointer comparison, for using FunctionPtrs as keys in maps and * sets. */ bool operator<(const FunctionPtr &other) const { return get() < other.get(); } }; } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_PARAMETER_H #define HALIDE_PARAMETER_H /** \file * Defines the internal representation of parameters to halide piplines */ #include namespace Halide { struct ArgumentEstimates; template class Buffer; struct Expr; struct Type; namespace Internal { struct ParameterContents; /** A reference-counted handle to a parameter to a halide * pipeline. May be a scalar parameter or a buffer */ class Parameter { void check_defined() const; void check_is_buffer() const; void check_is_scalar() const; void check_dim_ok(int dim) const; void check_type(const Type &t) const; protected: IntrusivePtr contents; public: /** Construct a new undefined handle */ Parameter() = default; /** Construct a new parameter of the given type. If the second * argument is true, this is a buffer parameter of the given * dimensionality, otherwise, it is a scalar parameter (and the * dimensionality should be zero). The parameter will be given a * unique auto-generated name. */ Parameter(const Type &t, bool is_buffer, int dimensions); /** Construct a new parameter of the given type with name given by * the third argument. If the second argument is true, this is a * buffer parameter, otherwise, it is a scalar parameter. The * third argument gives the dimensionality of the buffer * parameter. It should be zero for scalar parameters. If the * fifth argument is true, the the name being passed in was * explicitly specified (as opposed to autogenerated). */ Parameter(const Type &t, bool is_buffer, int dimensions, const std::string &name); Parameter(const Parameter &) = default; Parameter &operator=(const Parameter &) = default; Parameter(Parameter &&) = default; Parameter &operator=(Parameter &&) = default; /** Get the type of this parameter */ Type type() const; /** Get the dimensionality of this parameter. Zero for scalars. */ int dimensions() const; /** Get the name of this parameter */ const std::string &name() const; /** Does this parameter refer to a buffer/image? 
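 *
 * Whether a Parameter is a buffer or a scalar determines which of the
 * accessors below may be used. A hedged sketch for the scalar case, as it
 * might appear when jitting (illustrative only):
 * \code
 * Internal::Parameter p(Int(32), false, 0, "threshold");
 * p.set_scalar<int>(42);
 * int v = p.scalar<int>();  // v == 42
 * \endcode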
*/ bool is_buffer() const; /** If the parameter is a scalar parameter, get its currently * bound value. Only relevant when jitting */ template HALIDE_NO_USER_CODE_INLINE T scalar() const { check_type(type_of()); return *((const T *)(scalar_address())); } /** This returns the current value of scalar() * as an Expr. */ Expr scalar_expr() const; /** If the parameter is a scalar parameter, set its current * value. Only relevant when jitting */ template HALIDE_NO_USER_CODE_INLINE void set_scalar(T val) { check_type(type_of()); *((T *)(scalar_address())) = val; } /** If the parameter is a scalar parameter, set its current * value. Only relevant when jitting */ HALIDE_NO_USER_CODE_INLINE void set_scalar(const Type &val_type, halide_scalar_value_t val) { check_type(val_type); memcpy(scalar_address(), &val, val_type.bytes()); } /** If the parameter is a buffer parameter, get its currently * bound buffer. Only relevant when jitting */ Buffer buffer() const; /** Get the raw currently-bound buffer. null if unbound */ const halide_buffer_t *raw_buffer() const; /** If the parameter is a buffer parameter, set its current * value. Only relevant when jitting */ void set_buffer(const Buffer &b); /** Get the pointer to the current value of the scalar * parameter. For a given parameter, this address will never * change. Only relevant when jitting. */ void *scalar_address() const; /** Tests if this handle is the same as another handle */ bool same_as(const Parameter &other) const; /** Tests if this handle is non-nullptr */ bool defined() const; /** Get and set constraints for the min, extent, stride, and estimates on * the min/extent. */ //@{ void set_min_constraint(int dim, Expr e); void set_extent_constraint(int dim, Expr e); void set_stride_constraint(int dim, Expr e); void set_min_constraint_estimate(int dim, Expr min); void set_extent_constraint_estimate(int dim, Expr extent); void set_host_alignment(int bytes); Expr min_constraint(int dim) const; Expr extent_constraint(int dim) const; Expr stride_constraint(int dim) const; Expr min_constraint_estimate(int dim) const; Expr extent_constraint_estimate(int dim) const; int host_alignment() const; //@} /** Get and set constraints for scalar parameters. These are used * directly by Param, so they must be exported. */ // @{ void set_min_value(const Expr &e); Expr min_value() const; void set_max_value(const Expr &e); Expr max_value() const; void set_estimate(Expr e); Expr estimate() const; // @} /** Order Parameters by their IntrusivePtr so they can be used * to index maps. */ bool operator<(const Parameter &other) const { return contents < other.contents; } /** Get the ArgumentEstimates appropriate for this Parameter. */ ArgumentEstimates get_argument_estimates() const; }; /** Validate arguments to a call to a func, image or imageparam. */ void check_call_arg_types(const std::string &name, std::vector *args, int dims); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_PREFETCH_DIRECTIVE_H #define HALIDE_PREFETCH_DIRECTIVE_H /** \file * Defines the PrefetchDirective struct */ #include namespace Halide { /** Different ways to handle accesses outside the original extents in a prefetch. */ enum class PrefetchBoundStrategy { /** Clamp the prefetched exprs by intersecting the prefetched region with * the original extents. This may make the exprs of the prefetched region * more complicated. */ Clamp, /** Guard the prefetch with if-guards that ignores the prefetch if * any of the prefetched region ever goes beyond the original extents * (i.e. 
all or nothing). */ GuardWithIf, /** Leave the prefetched exprs as are (no if-guards around the prefetch * and no intersecting with the original extents). This makes the prefetch * exprs simpler but this may cause prefetching of regions outside the original * extents. This is good if the prefetch won't fault when accessing regions * outside the original extents. */ NonFaulting }; namespace Internal { struct PrefetchDirective { std::string name; std::string var; Expr offset; PrefetchBoundStrategy strategy; // If it's a prefetch load from an image parameter, this points to that. Parameter param; }; } // namespace Internal } // namespace Halide #endif // HALIDE_PREFETCH_DIRECTIVE_H namespace Halide { class Func; struct VarOrRVar; namespace Internal { class Function; struct FunctionContents; struct LoopLevelContents; } // namespace Internal /** Different ways to handle a tail case in a split when the * factor does not provably divide the extent. */ enum class TailStrategy { /** Round up the extent to be a multiple of the split * factor. Not legal for RVars, as it would change the meaning * of the algorithm. Pros: generates the simplest, fastest * code. Cons: if used on a stage that reads from the input or * writes to the output, constrains the input or output size * to be a multiple of the split factor. */ RoundUp, /** Guard the inner loop with an if statement that prevents * evaluation beyond the original extent. Always legal. The if * statement is treated like a boundary condition, and * factored out into a loop epilogue if possible. Pros: no * redundant re-evaluation; does not constrain input or * output sizes. Cons: increases code size due to separate * tail-case handling; vectorization will scalarize in the tail * case to handle the if statement. */ GuardWithIf, /** Prevent evaluation beyond the original extent by shifting * the tail case inwards, re-evaluating some points near the * end. Only legal for pure variables in pure definitions. If * the inner loop is very simple, the tail case is treated * like a boundary condition and factored out into an * epilogue. * * This is a good trade-off between several factors. Like * RoundUp, it supports vectorization well, because the inner * loop is always a fixed size with no data-dependent * branching. It increases code size slightly for inner loops * due to the epilogue handling, but not for outer loops * (e.g. loops over tiles). If used on a stage that reads from * an input or writes to an output, this strategy only requires * that the input/output extent be at least the split factor, * instead of a multiple of the split factor as with RoundUp. */ ShiftInwards, /** For pure definitions use ShiftInwards. For pure vars in * update definitions use RoundUp. For RVars in update * definitions use GuardWithIf. */ Auto }; /** Different ways to handle the case when the start/end of the loops of stages * computed with (fused) are not aligned. */ enum class LoopAlignStrategy { /** Shift the start of the fused loops to align. */ AlignStart, /** Shift the end of the fused loops to align. */ AlignEnd, /** compute_with will make no attempt to align the start/end of the * fused loops. */ NoAlign, /** By default, LoopAlignStrategy is set to NoAlign. */ Auto }; /** A reference to a site in a Halide statement at the top of the * body of a particular for loop. Evaluating a region of a halide * function is done by generating a loop nest that spans its * dimensions.
We schedule the inputs to that function by * recursively injecting realizations for them at particular sites * in this loop nest. A LoopLevel identifies such a site. The site * can either be a loop nest within all stages of a function * or it can refer to a loop nest within a particular function's * stage (initial definition or updates). * * Note that a LoopLevel is essentially a pointer to an underlying value; * all copies of a LoopLevel refer to the same site, so mutating one copy * (via the set() method) will effectively mutate all copies: \code Func f; Var x; LoopLevel a(f, x); // Both a and b refer to LoopLevel(f, x) LoopLevel b = a; // Now both a and b refer to LoopLevel::root() a.set(LoopLevel::root()); \endcode * This is quite useful when splitting Halide code into utility libraries, as it allows * a library to schedule code according to a caller's specifications, even if the caller * hasn't fully defined its pipeline yet: \code Func demosaic(Func input, LoopLevel intermed_compute_at, LoopLevel intermed_store_at, LoopLevel output_compute_at) { Func intermed = ...; Func output = ...; intermed.compute_at(intermed_compute_at).store_at(intermed_store_at); output.compute_at(output_compute_at); return output; } void process() { // Note that these LoopLevels are all undefined when we pass them to demosaic() LoopLevel intermed_compute_at, intermed_store_at, output_compute_at; Func input = ...; Func demosaiced = demosaic(input, intermed_compute_at, intermed_store_at, output_compute_at); Func output = ...; // We need to ensure all LoopLevels have a well-defined value prior to lowering: intermed_compute_at.set(LoopLevel(output, y)); intermed_store_at.set(LoopLevel(output, y)); output_compute_at.set(LoopLevel(output, x)); } \endcode */ class LoopLevel { Internal::IntrusivePtr contents; explicit LoopLevel(Internal::IntrusivePtr c) : contents(std::move(c)) { } LoopLevel(const std::string &func_name, const std::string &var_name, bool is_rvar, int stage_index, bool locked = false); public: /** Return the index of the function stage associated with this loop level. * Asserts if undefined */ int stage_index() const; /** Identify the loop nest corresponding to some dimension of some function */ // @{ LoopLevel(const Internal::Function &f, const VarOrRVar &v, int stage_index = -1); LoopLevel(const Func &f, const VarOrRVar &v, int stage_index = -1); // @} /** Construct an undefined LoopLevel. Calling any method on an undefined * LoopLevel (other than set()) will assert. */ LoopLevel(); /** Construct a special LoopLevel value that implies * that a function should be inlined away. */ static LoopLevel inlined(); /** Construct a special LoopLevel value which represents the * location outside of all for loops. */ static LoopLevel root(); /** Mutate our contents to match the contents of 'other'. */ void set(const LoopLevel &other); // All the public methods below this point are meant only for internal // use by Halide, rather than user code; hence, they are deliberately // documented with plain comments (rather than Doxygen) to avoid being // present in user documentation. // Lock this LoopLevel. LoopLevel &lock(); // Return the Func name. Asserts if the LoopLevel is_root() or is_inlined() or !defined(). std::string func() const; // Return the VarOrRVar. Asserts if the LoopLevel is_root() or is_inlined() or !defined(). VarOrRVar var() const; // Return true iff the LoopLevel is defined. (Only LoopLevels created // with the default ctor are undefined.) 
bool defined() const; // Test if a loop level corresponds to inlining the function. bool is_inlined() const; // Test if a loop level is 'root', which describes the site // outside of all for loops. bool is_root() const; // Return a string of the form func.var -- note that this is safe // to call for root or inline LoopLevels, but asserts if !defined(). std::string to_string() const; // Compare this loop level against the variable name of a for // loop, to see if this loop level refers to the site // immediately inside this loop. Asserts if !defined(). bool match(const std::string &loop) const; bool match(const LoopLevel &other) const; // Check if two loop levels are exactly the same. bool operator==(const LoopLevel &other) const; bool operator!=(const LoopLevel &other) const { return !(*this == other); } private: void check_defined() const; void check_locked() const; void check_defined_and_locked() const; }; struct FuseLoopLevel { LoopLevel level; /** Contains alignment strategies for the fused dimensions (indexed by the * dimension name). If not in the map, use the default alignment strategy * to align the fused dimension (see \ref LoopAlignStrategy::Auto). */ std::map align; FuseLoopLevel() : level(LoopLevel::inlined().lock()) { } FuseLoopLevel(const LoopLevel &level, const std::map &align) : level(level), align(align) { } }; namespace Internal { class IRMutator; struct ReductionVariable; struct Split { std::string old_var, outer, inner; Expr factor; bool exact; // Is it required that the factor divides the extent // of the old var. True for splits of RVars. Forces // tail strategy to be GuardWithIf. TailStrategy tail; enum SplitType { SplitVar = 0, RenameVar, FuseVars, PurifyRVar }; // If split_type is Rename, then this is just a renaming of the // old_var to the outer and not a split. The inner var should // be ignored, and factor should be one. Renames are kept in // the same list as splits so that ordering between them is // respected. // If split type is Purify, this replaces the old_var RVar to // the outer Var. The inner var should be ignored, and factor // should be one. // If split_type is Fuse, then this does the opposite of a // split, it joins the outer and inner into the old_var. SplitType split_type; bool is_rename() const { return split_type == RenameVar; } bool is_split() const { return split_type == SplitVar; } bool is_fuse() const { return split_type == FuseVars; } bool is_purify() const { return split_type == PurifyRVar; } }; struct Dim { std::string var; ForType for_type; DeviceAPI device_api; enum Type { PureVar = 0, PureRVar, ImpureRVar }; Type dim_type; bool is_pure() const { return (dim_type == PureVar) || (dim_type == PureRVar); } bool is_rvar() const { return (dim_type == PureRVar) || (dim_type == ImpureRVar); } bool is_unordered_parallel() const { return Halide::Internal::is_unordered_parallel(for_type); } bool is_parallel() const { return Halide::Internal::is_parallel(for_type); } }; struct Bound { std::string var; Expr min, extent, modulus, remainder; }; struct StorageDim { std::string var; Expr alignment; Expr fold_factor; bool fold_forward; }; /** This represents two stages with fused loop nests from outermost to a specific * loop level. The loops to compute func_1(stage_1) are fused with the loops to * compute func_2(stage_2) from outermost to loop level var_name and the * computation from stage_1 of func_1 occurs first. 
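 *
 * FusedPairs are populated from user-level scheduling calls such as
 * Func::compute_with and Stage::compute_with. A hedged sketch (the exact
 * overloads are documented on those methods):
 * \code
 * Func f, g;
 * Var x, y;
 * f(x, y) = x + y;
 * g(x, y) = x * y;       // no producer/consumer relationship with f
 * g.compute_with(f, y);  // fuse the loop nests of f and g up to y
 * \endcode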
*/ struct FusedPair { std::string func_1; std::string func_2; size_t stage_1; size_t stage_2; std::string var_name; FusedPair() = default; FusedPair(const std::string &f1, size_t s1, const std::string &f2, size_t s2, const std::string &var) : func_1(f1), func_2(f2), stage_1(s1), stage_2(s2), var_name(var) { } bool operator==(const FusedPair &other) const { return (func_1 == other.func_1) && (func_2 == other.func_2) && (stage_1 == other.stage_1) && (stage_2 == other.stage_2) && (var_name == other.var_name); } bool operator<(const FusedPair &other) const { if (func_1 != other.func_1) { return func_1 < other.func_1; } if (func_2 != other.func_2) { return func_2 < other.func_2; } if (var_name != other.var_name) { return var_name < other.var_name; } if (stage_1 != other.stage_1) { return stage_1 < other.stage_1; } return stage_2 < other.stage_2; } }; struct FuncScheduleContents; struct StageScheduleContents; struct FunctionContents; /** A schedule for a Function of a Halide pipeline. This schedule is * applied to all stages of the Function. Right now this interface is * basically a struct, offering mutable access to its innards. * In the future it may become more encapsulated. */ class FuncSchedule { IntrusivePtr contents; public: FuncSchedule(IntrusivePtr c) : contents(std::move(c)) { } FuncSchedule(const FuncSchedule &other) : contents(other.contents) { } FuncSchedule(); /** Return a deep copy of this FuncSchedule. It recursively deep copies all * called functions, schedules, specializations, and reduction domains. This * method takes a map of as input * and would use the deep-copied FunctionContents from the map if exists * instead of creating a new deep-copy to avoid creating deep-copies of the * same FunctionContents multiple times. */ FuncSchedule deep_copy( std::map &copied_map) const; /** This flag is set to true if the schedule is memoized. */ // @{ bool &memoized(); bool memoized() const; // @} /** Is the production of this Function done asynchronously */ bool &async(); bool async() const; /** The list and order of dimensions used to store this * function. The first dimension in the vector corresponds to the * innermost dimension for storage (i.e. which dimension is * tightly packed in memory) */ // @{ const std::vector &storage_dims() const; std::vector &storage_dims(); // @} /** The memory type (heap/stack/shared/etc) used to back this Func. */ // @{ MemoryType memory_type() const; MemoryType &memory_type(); // @} /** You may explicitly bound some of the dimensions of a function, * or constrain them to lie on multiples of a given factor. See * \ref Func::bound and \ref Func::align_bounds */ // @{ const std::vector &bounds() const; std::vector &bounds(); // @} /** You may explicitly specify an estimate of some of the function * dimensions. See \ref Func::estimate */ // @{ const std::vector &estimates() const; std::vector &estimates(); // @} /** Mark calls of a function by 'f' to be replaced with its identity * wrapper or clone during the lowering stage. If the string 'f' is empty, * it means replace all calls to the function by all other functions * (excluding itself) in the pipeline with the global identity wrapper. * See \ref Func::in and \ref Func::clone_in for more details. */ // @{ const std::map &wrappers() const; std::map &wrappers(); void add_wrapper(const std::string &f, const Internal::FunctionPtr &wrapper); // @} /** At what sites should we inject the allocation and the * computation of this function? The store_level must be outside * of or equal to the compute_level. 
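 * For example, the user-facing schedule below (a hedged sketch) makes the
 * store_level of f the y loop of out and its compute_level the x loop:
 * \code
 * Func f, out;
 * Var x, y;
 * f(x, y) = x + y;
 * out(x, y) = f(x, y) * 2;
 * f.store_at(out, y).compute_at(out, x);
 * \endcode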
If the compute_level is * inline, the store_level is meaningless. See \ref Func::store_at * and \ref Func::compute_at */ // @{ const LoopLevel &store_level() const; const LoopLevel &compute_level() const; LoopLevel &store_level(); LoopLevel &compute_level(); // @} /** Pass an IRVisitor through to all Exprs referenced in the * Schedule. */ void accept(IRVisitor *) const; /** Pass an IRMutator through to all Exprs referenced in the * Schedule. */ void mutate(IRMutator *); }; /** A schedule for a single stage of a Halide pipeline. Right now this * interface is basically a struct, offering mutable access to its * innards. In the future it may become more encapsulated. */ class StageSchedule { IntrusivePtr contents; public: StageSchedule(IntrusivePtr c) : contents(std::move(c)) { } StageSchedule(const StageSchedule &other) : contents(other.contents) { } StageSchedule(); /** Return a copy of this StageSchedule. */ StageSchedule get_copy() const; /** This flag is set to true if the dims list has been manipulated * by the user (or if a ScheduleHandle was created that could have * been used to manipulate it). It controls the warning that * occurs if you schedule the vars of the pure step but not the * update steps. */ // @{ bool &touched(); bool touched() const; // @} /** RVars of reduction domain associated with this schedule if there is any. */ // @{ const std::vector &rvars() const; std::vector &rvars(); // @} /** The traversal of the domain of a function can have some of its * dimensions split into sub-dimensions. See \ref Func::split */ // @{ const std::vector &splits() const; std::vector &splits(); // @} /** The list and ordering of dimensions used to evaluate this * function, after all splits have taken place. The first * dimension in the vector corresponds to the innermost for loop, * and the last is the outermost. Also specifies what type of for * loop to use for each dimension. Does not specify the bounds on * each dimension. These get inferred from how the function is * used, what the splits are, and any optional bounds in the list below. */ // @{ const std::vector &dims() const; std::vector &dims(); // @} /** You may perform prefetching in some of the dimensions of a * function. See \ref Func::prefetch */ // @{ const std::vector &prefetches() const; std::vector &prefetches(); // @} /** Innermost loop level of fused loop nest for this function stage. * Fusion runs from outermost to this loop level. The stages being fused * should not have producer/consumer relationship. See \ref Func::compute_with * and \ref Stage::compute_with */ // @{ const FuseLoopLevel &fuse_level() const; FuseLoopLevel &fuse_level(); // @} /** List of function stages that are to be fused with this function stage * from the outermost loop to a certain loop level. Those function stages * are to be computed AFTER this function stage at the last fused loop level. * This list is populated when realization_order() is called. See * \ref Func::compute_with and \ref Stage::compute_with */ // @{ const std::vector &fused_pairs() const; std::vector &fused_pairs(); /** Are race conditions permitted? */ // @{ bool allow_race_conditions() const; bool &allow_race_conditions(); // @} /** Use atomic update? */ // @{ bool atomic() const; bool &atomic(); // @} /** Atomic updates are only allowed on associative reductions. * We try to prove the associativity, but the user can override * the associativity test and suppress compiler error if the prover * fails to recognize the associativity or the user does not care. 
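 *
 * A hedged sketch of the user-facing API behind these flags (the histogram
 * update below is associative, so the atomic() scheduling directive is legal):
 * \code
 * Func hist;
 * Var x;
 * RDom r(0, 100);
 * hist(x) = 0;
 * hist(r % 16) += 1;
 * hist.update().atomic().parallel(r);
 * \endcode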
*/ // @{ bool override_atomic_associativity_test() const; bool &override_atomic_associativity_test(); // @} /** Pass an IRVisitor through to all Exprs referenced in the * Schedule. */ void accept(IRVisitor *) const; /** Pass an IRMutator through to all Exprs referenced in the * Schedule. */ void mutate(IRMutator *); }; } // namespace Internal } // namespace Halide #endif namespace Halide { namespace Internal { struct ApplySplitResult { // If type is "Substitution", then this represents a substitution of // variable "name" to value. If type is "LetStmt", we should insert a new // let stmt defining "name" with value "value". If type is "Predicate", we // should ignore "name" and the predicate is "value". std::string name; Expr value; enum Type { Substitution = 0, LetStmt, Predicate }; Type type; ApplySplitResult(const std::string &n, Expr val, Type t) : name(n), value(std::move(val)), type(t) { } ApplySplitResult(Expr val) : name(""), value(std::move(val)), type(Predicate) { } bool is_substitution() const { return (type == Substitution); } bool is_let() const { return (type == LetStmt); } bool is_predicate() const { return (type == Predicate); } }; /** Given a Split schedule on a definition (init or update), return a list * of predicates on the definition, substitutions that need to be applied to * the definition (in ascending order of application), and let stmts which * define the values of variables referred to by the predicates and substitutions * (ordered from innermost to outermost let). */ std::vector apply_split( const Split &split, bool is_update, const std::string &prefix, std::map &dim_extent_alignment); /** Compute the loop bounds of the new dimensions resulting from applying the * split schedules using the loop bounds of the old dimensions. */ std::vector> compute_loop_bounds_after_split( const Split &split, const std::string &prefix); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_ARGUMENT_H #define HALIDE_ARGUMENT_H /** \file * Defines a type used for expressing the type signature of a * generated halide pipeline */ namespace Halide { template class Buffer; struct ArgumentEstimates { /** If this is a scalar argument, then these are its default, min, max, and estimated values. * For buffer arguments, all should be undefined. */ Expr scalar_def, scalar_min, scalar_max, scalar_estimate; /** If this is a buffer argument, these are the estimated min and * extent for each dimension. If there are no estimates, * buffer_estimates.size() can be zero; otherwise, it must always * equal the dimensions */ Region buffer_estimates; bool operator==(const ArgumentEstimates &rhs) const; }; /** * A struct representing an argument to a halide-generated * function. Used for specifying the function signature of * generated code. */ struct Argument { /** The name of the argument */ std::string name; /** An argument is either a primitive type (for parameters), or a * buffer pointer. * * If kind == InputScalar, then type fully encodes the expected type * of the scalar argument. * * If kind == InputBuffer|OutputBuffer, then type.bytes() should be used * to determine the elem_size of the buffer; additionally, type.code *should* * reflect the expected interpretation of the buffer data (e.g. float vs int), * but there is no runtime enforcement of this at present.
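 *
 * A hedged sketch of constructing Arguments for a pipeline signature (the
 * names here are illustrative):
 * \code
 * Argument input("input", Argument::InputBuffer, UInt(8), 2, ArgumentEstimates{});
 * Argument threshold("threshold", Argument::InputScalar, Float(32), 0, ArgumentEstimates{});
 * \endcode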
*/ enum Kind { InputScalar = halide_argument_kind_input_scalar, InputBuffer = halide_argument_kind_input_buffer, OutputBuffer = halide_argument_kind_output_buffer }; Kind kind = InputScalar; /** If kind == InputBuffer|OutputBuffer, this is the dimensionality of the buffer. * If kind == InputScalar, this value is ignored (and should always be set to zero) */ uint8_t dimensions = 0; /** If this is a scalar parameter, then this is its type. * * If this is a buffer parameter, this is the type of its * elements. * * Note that type.lanes should always be 1 here. */ Type type; /* The estimates (if any) and default/min/max values (if any) for this Argument. */ ArgumentEstimates argument_estimates; Argument() = default; Argument(const std::string &_name, Kind _kind, const Type &_type, int _dimensions, const ArgumentEstimates &argument_estimates); // Not explicit, so that you can put Buffer in an argument list, // to indicate that it shouldn't be baked into the object file, // but instead received as an argument at runtime template Argument(Buffer im) : name(im.name()), kind(InputBuffer), dimensions(im.dimensions()), type(im.type()) { } bool is_buffer() const { return kind == InputBuffer || kind == OutputBuffer; } bool is_scalar() const { return kind == InputScalar; } bool is_input() const { return kind == InputScalar || kind == InputBuffer; } bool is_output() const { return kind == OutputBuffer; } bool operator==(const Argument &rhs) const { return name == rhs.name && kind == rhs.kind && dimensions == rhs.dimensions && type == rhs.type && argument_estimates == rhs.argument_estimates; } }; } // namespace Halide #endif #ifndef HALIDE_ASSOCIATIVE_OPS_TABLE_H #define HALIDE_ASSOCIATIVE_OPS_TABLE_H /** \file * Tables listing associative operators and their identities. */ #ifndef HALIDE_IR_EQUALITY_H #define HALIDE_IR_EQUALITY_H /** \file * Methods to test Exprs and Stmts for equality of value */ namespace Halide { namespace Internal { /** A compare struct suitable for use in std::map and std::set that * computes a lexical ordering on IR nodes. */ struct IRDeepCompare { bool operator()(const Expr &a, const Expr &b) const; bool operator()(const Stmt &a, const Stmt &b) const; }; /** Lossily track known equal exprs with a cache. On collision, the * old pair is evicted. Used below by ExprWithCompareCache. */ class IRCompareCache { private: struct Entry { Expr a, b; }; int bits; uint32_t hash(const Expr &a, const Expr &b) const { // Note this hash is symmetric in a and b, so that a // comparison of a and b hashes to the same bucket as // a comparison of b and a. uint64_t pa = (uint64_t)(a.get()); uint64_t pb = (uint64_t)(b.get()); uint64_t mix = (pa + pb) + (pa ^ pb); mix ^= (mix >> bits); mix ^= (mix >> (bits * 2)); uint32_t bottom = mix & ((1 << bits) - 1); return bottom; } std::vector entries; public: void insert(const Expr &a, const Expr &b) { uint32_t h = hash(a, b); entries[h].a = a; entries[h].b = b; } bool contains(const Expr &a, const Expr &b) const { uint32_t h = hash(a, b); const Entry &e = entries[h]; return ((a.same_as(e.a) && b.same_as(e.b)) || (a.same_as(e.b) && b.same_as(e.a))); } void clear() { for (size_t i = 0; i < entries.size(); i++) { entries[i].a = Expr(); entries[i].b = Expr(); } } IRCompareCache() = default; IRCompareCache(int b) : bits(b), entries(static_cast(1) << bits) { } }; /** A wrapper around Exprs so that they can be deeply compared with a * cache for known-equal subexpressions.
Useful for unsanitized Exprs * coming in from the front-end, which may be horrible graphs with * sub-expressions that are equal by value but not by identity. This * isn't a comparison object like IRDeepCompare above, because libc++ * requires that comparison objects be stateless (and constructs a new * one for each comparison!), so they can't have a cache associated * with them. However, by sneakily making the cache a mutable member * of the objects being compared, we can dodge this issue. * * Clunky example usage: * \code Expr a, b, c, query; std::set s; IRCompareCache cache(8); s.insert(ExprWithCompareCache(a, &cache)); s.insert(ExprWithCompareCache(b, &cache)); s.insert(ExprWithCompareCache(c, &cache)); if (m.contains(ExprWithCompareCache(query, &cache))) {...} \endcode * */ struct ExprWithCompareCache { Expr expr; mutable IRCompareCache *cache; ExprWithCompareCache() : cache(nullptr) { } ExprWithCompareCache(const Expr &e, IRCompareCache *c) : expr(e), cache(c) { } /** The comparison uses (and updates) the cache */ bool operator<(const ExprWithCompareCache &other) const; }; /** Compare IR nodes for equality of value. Traverses entire IR * tree. For equality of reference, use Expr::same_as. If you're * comparing non-CSE'd Exprs, use graph_equal, which is safe for nasty * graphs of IR nodes. */ // @{ bool equal(const Expr &a, const Expr &b); bool equal(const Stmt &a, const Stmt &b); bool graph_equal(const Expr &a, const Expr &b); bool graph_equal(const Stmt &a, const Stmt &b); // @} void ir_equality_test(); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_IR_OPERATOR_H #define HALIDE_IR_OPERATOR_H /** \file * * Defines various operator overloads and utility functions that make * it more pleasant to work with Halide expressions. */ #include #ifndef HALIDE_TUPLE_H #define HALIDE_TUPLE_H /** \file * * Defines Tuple - the front-end handle on small arrays of expressions. */ #include namespace Halide { class FuncRef; /** Create a small array of Exprs for defining and calling functions * with multiple outputs. */ class Tuple { private: std::vector exprs; public: /** The number of elements in the tuple. */ size_t size() const { return exprs.size(); } /** Get a reference to an element. */ Expr &operator[](size_t x) { user_assert(x < exprs.size()) << "Tuple access out of bounds\n"; return exprs[x]; } /** Get a copy of an element. */ Expr operator[](size_t x) const { user_assert(x < exprs.size()) << "Tuple access out of bounds\n"; return exprs[x]; } /** Construct a Tuple of a single Expr */ explicit Tuple(Expr e) { exprs.emplace_back(std::move(e)); } /** Construct a Tuple from some Exprs. */ //@{ template Tuple(const Expr &a, const Expr &b, Args &&... args) { exprs = std::vector{a, b, std::forward(args)...}; } //@} /** Construct a Tuple from a vector of Exprs */ explicit HALIDE_NO_USER_CODE_INLINE Tuple(const std::vector &e) : exprs(e) { user_assert(!e.empty()) << "Tuples must have at least one element\n"; } /** Construct a Tuple from a function reference. */ Tuple(const FuncRef &); /** Treat the tuple as a vector of Exprs */ const std::vector &as_vector() const { return exprs; } }; } // namespace Halide #endif namespace Halide { namespace Internal { /** Is the expression either an IntImm, a FloatImm, a StringImm, or a * Cast of the same, or a Ramp or Broadcast of the same. Doesn't do * any constant folding. */ bool is_const(const Expr &e); /** Is the expression an IntImm, FloatImm of a particular value, or a * Cast, or Broadcast of the same. 
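 *
 * A small illustrative sketch of these constant-inspection helpers:
 * \code
 * Expr e = 3;
 * is_const(e);                          // true
 * is_const(e, 3);                       // true
 * const int64_t *v = as_const_int(e);   // non-null, *v == 3
 * \endcode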
*/ bool is_const(const Expr &e, int64_t v); /** If an expression is an IntImm or a Broadcast of an IntImm, return * a pointer to its value. Otherwise returns nullptr. */ const int64_t *as_const_int(const Expr &e); /** If an expression is a UIntImm or a Broadcast of a UIntImm, return * a pointer to its value. Otherwise returns nullptr. */ const uint64_t *as_const_uint(const Expr &e); /** If an expression is a FloatImm or a Broadcast of a FloatImm, * return a pointer to its value. Otherwise returns nullptr. */ const double *as_const_float(const Expr &e); /** Is the expression a constant integer power of two. Also returns * log base two of the expression if it is. Only returns true for * integer types. */ bool is_const_power_of_two_integer(const Expr &e, int *bits); /** Is the expression a const (as defined by is_const), and also * strictly greater than zero (in all lanes, if a vector expression) */ bool is_positive_const(const Expr &e); /** Is the expression a const (as defined by is_const), and also * strictly less than zero (in all lanes, if a vector expression) */ bool is_negative_const(const Expr &e); /** Is the expression a const (as defined by is_const), and also * strictly less than zero (in all lanes, if a vector expression) and * is its negative value representable. (This excludes the most * negative value of the Expr's type from inclusion. Intended to be * used when the value will be negated as part of simplification.) */ bool is_negative_negatable_const(const Expr &e); /** Is the expression an undef */ bool is_undef(const Expr &e); /** Is the expression a const (as defined by is_const), and also equal * to zero (in all lanes, if a vector expression) */ bool is_zero(const Expr &e); /** Is the expression a const (as defined by is_const), and also equal * to one (in all lanes, if a vector expression) */ bool is_one(const Expr &e); /** Is the expression a const (as defined by is_const), and also equal * to two (in all lanes, if a vector expression) */ bool is_two(const Expr &e); /** Is the statement a no-op (which we represent as either an * undefined Stmt, or as an Evaluate node of a constant) */ bool is_no_op(const Stmt &s); /** Does the expression * 1) Take on the same value no matter where it appears in a Stmt, and * 2) Evaluating it has no side-effects */ bool is_pure(const Expr &e); /** Construct an immediate of the given type from any numeric C++ type. */ // @{ Expr make_const(Type t, int64_t val); Expr make_const(Type t, uint64_t val); Expr make_const(Type t, double val); inline Expr make_const(Type t, int32_t val) { return make_const(t, (int64_t)val); } inline Expr make_const(Type t, uint32_t val) { return make_const(t, (uint64_t)val); } inline Expr make_const(Type t, int16_t val) { return make_const(t, (int64_t)val); } inline Expr make_const(Type t, uint16_t val) { return make_const(t, (uint64_t)val); } inline Expr make_const(Type t, int8_t val) { return make_const(t, (int64_t)val); } inline Expr make_const(Type t, uint8_t val) { return make_const(t, (uint64_t)val); } inline Expr make_const(Type t, bool val) { return make_const(t, (uint64_t)val); } inline Expr make_const(Type t, float val) { return make_const(t, (double)val); } inline Expr make_const(Type t, float16_t val) { return make_const(t, (double)val); } // @} /** Construct a unique signed_integer_overflow Expr */ Expr make_signed_integer_overflow(Type type); /** Check if a constant value can be correctly represented as the given type. 
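 * On failure this is expected to raise a user error rather than return
 * normally (a hedged assumption; consult the implementation for the exact
 * behavior):
 * \code
 * check_representable(UInt(8), 255);  // fine
 * check_representable(UInt(8), 256);  // assumed to error: 256 does not fit in UInt(8)
 * \endcode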
*/ void check_representable(Type t, int64_t val); /** Construct a boolean constant from a C++ boolean value. * May also be a vector if width is given. * It is not possible to coerce a C++ boolean to Expr because * if we provide such a path then char objects can ambiguously * be converted to Halide Expr or to std::string. The problem * is that C++ does not have a real bool type - it is in fact * close enough to char that C++ does not know how to distinguish them. * make_bool is the explicit coercion. */ Expr make_bool(bool val, int lanes = 1); /** Construct the representation of zero in the given type */ Expr make_zero(Type t); /** Construct the representation of one in the given type */ Expr make_one(Type t); /** Construct the representation of two in the given type */ Expr make_two(Type t); /** Construct the constant boolean true. May also be a vector of * trues, if a lanes argument is given. */ Expr const_true(int lanes = 1); /** Construct the constant boolean false. May also be a vector of * falses, if a lanes argument is given. */ Expr const_false(int lanes = 1); /** Attempt to cast an expression to a smaller type while provably not * losing information. If it can't be done, return an undefined * Expr. */ Expr lossless_cast(Type t, Expr e); /** Coerce the two expressions to have the same type, using C-style * casting rules. For the purposes of casting, a boolean type is * UInt(1). We use the following procedure: * * If the types already match, do nothing. * * Then, if one type is a vector and the other is a scalar, the scalar * is broadcast to match the vector width, and we continue. * * Then, if one type is floating-point and the other is not, the * non-float is cast to the floating-point type, and we're done. * * Then, if both types are unsigned ints, the one with fewer bits is * cast to match the one with more bits and we're done. * * Then, if both types are signed ints, the one with fewer bits is * cast to match the one with more bits and we're done. * * Finally, if one type is an unsigned int and the other type is a signed * int, both are cast to a signed int with the greater of the two * bit-widths. For example, matching an Int(8) with a UInt(16) results * in an Int(16). * */ void match_types(Expr &a, Expr &b); /** Asserts that both expressions are integer types and are either * both signed or both unsigned. If one argument is scalar and the * other a vector, the scalar is broadcasted to have the same number * of lanes as the vector. If one expression is of narrower type than * the other, it is widened to the bit width of the wider. */ void match_types_bitwise(Expr &a, Expr &b, const char *op_name); /** Halide's vectorizable transcendentals. */ // @{ Expr halide_log(const Expr &a); Expr halide_exp(const Expr &a); Expr halide_erf(const Expr &a); // @} /** Raise an expression to an integer power by repeatedly multiplying * it by itself. */ Expr raise_to_integer_power(Expr a, int64_t b); /** Split a boolean condition into vector of ANDs. If 'cond' is undefined, * return an empty vector. */ void split_into_ands(const Expr &cond, std::vector &result); /** A builder to help create Exprs representing halide_buffer_t * structs (e.g. foo.buffer) via calls to halide_buffer_init. Fill out * the fields and then call build. The resulting Expr will be a call * to halide_buffer_init with the struct members as arguments. If the * buffer_memory field is undefined, it uses a call to alloca to make * some stack memory for the buffer. 
If the shape_memory field is * undefined, it similarly uses stack memory for the shape. If the * shape_memory field is null, it uses the dim field already in the * buffer. Other unitialized fields will take on a value of zero in * the constructed buffer. */ struct BufferBuilder { Expr buffer_memory, shape_memory; Expr host, device, device_interface; Type type; int dimensions = 0; std::vector mins, extents, strides; Expr host_dirty, device_dirty; Expr build() const; }; /** If e is a ramp expression with stride, default 1, return the base, * otherwise undefined. */ Expr strided_ramp_base(const Expr &e, int stride = 1); /** Implementations of division and mod that are specific to Halide. * Use these implementations; do not use native C division or mod to * simplify Halide expressions. Halide division and modulo satisify * the Euclidean definition of division for integers a and b: * /code when b != 0, (a/b)*b + a%b = a 0 <= a%b < |b| /endcode * * Additionally, mod by zero returns zero, and div by zero returns * zero. This makes mod and div total functions. */ // @{ template inline T mod_imp(T a, T b) { Type t = type_of(); if (!t.is_float() && b == 0) { return 0; } else if (t.is_int()) { int64_t ia = a; int64_t ib = b; int64_t a_neg = ia >> 63; int64_t b_neg = ib >> 63; int64_t b_zero = (ib == 0) ? -1 : 0; ia -= a_neg; int64_t r = ia % (ib | b_zero); r += (a_neg & ((ib ^ b_neg) + ~b_neg)); r &= ~b_zero; return r; } else { return a % b; } } template inline T div_imp(T a, T b) { Type t = type_of(); if (!t.is_float() && b == 0) { return (T)0; } else if (t.is_int()) { // Do it as 64-bit int64_t ia = a; int64_t ib = b; int64_t a_neg = ia >> 63; int64_t b_neg = ib >> 63; int64_t b_zero = (ib == 0) ? -1 : 0; ib -= b_zero; ia -= a_neg; int64_t q = ia / ib; q += a_neg & (~b_neg - b_neg); q &= ~b_zero; return (T)q; } else { return a / b; } } // @} // Special cases for float, double. template<> inline float mod_imp(float a, float b) { float f = a - b * (floorf(a / b)); // The remainder has the same sign as b. return f; } template<> inline double mod_imp(double a, double b) { double f = a - b * (std::floor(a / b)); return f; } template<> inline float div_imp(float a, float b) { return a / b; } template<> inline double div_imp(double a, double b) { return a / b; } /** Return an Expr that is identical to the input Expr, but with * all calls to likely() and likely_if_innermost() removed. */ Expr remove_likelies(const Expr &e); /** Return a Stmt that is identical to the input Stmt, but with * all calls to likely() and likely_if_innermost() removed. */ Stmt remove_likelies(const Stmt &s); // Secondary args to print can be Exprs or const char * inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector &args) { } template inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector &args, const char *arg, Args &&... more_args) { args.emplace_back(std::string(arg)); collect_print_args(args, std::forward(more_args)...); } template inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector &args, Expr arg, Args &&... more_args) { args.push_back(std::move(arg)); collect_print_args(args, std::forward(more_args)...); } Expr requirement_failed_error(Expr condition, const std::vector &args); Expr memoize_tag_helper(Expr result, const std::vector &cache_key_values); } // namespace Internal /** Cast an expression to the halide type corresponding to the C++ type T. */ template inline Expr cast(Expr a) { return cast(type_of(), std::move(a)); } /** Cast an expression to a new type. 
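 * For example (a brief sketch; `x` stands for some existing Expr):
 * \code
 * Expr x = ...;
 * Expr f = cast(Float(32), x);  // runtime Type form
 * Expr b = cast<uint8_t>(f);    // template form, equivalent to cast(UInt(8), f)
 * \endcode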
*/ Expr cast(Type t, Expr a); /** Return the sum of two expressions, doing any necessary type * coercion using \ref Internal::match_types */ Expr operator+(Expr a, Expr b); /** Add an expression and a constant integer. Coerces the type of the * integer to match the type of the expression. Errors if the integer * cannot be represented in the type of the expression. */ // @{ Expr operator+(Expr a, int b); /** Add a constant integer and an expression. Coerces the type of the * integer to match the type of the expression. Errors if the integer * cannot be represented in the type of the expression. */ Expr operator+(int a, Expr b); /** Modify the first expression to be the sum of two expressions, * without changing its type. This casts the second argument to match * the type of the first. */ Expr &operator+=(Expr &a, Expr b); /** Return the difference of two expressions, doing any necessary type * coercion using \ref Internal::match_types */ Expr operator-(Expr a, Expr b); /** Subtracts a constant integer from an expression. Coerces the type of the * integer to match the type of the expression. Errors if the integer * cannot be represented in the type of the expression. */ Expr operator-(Expr a, int b); /** Subtracts an expression from a constant integer. Coerces the type * of the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator-(int a, Expr b); /** Return the negative of the argument. Does no type casting, so more * formally: return that number which when added to the original, * yields zero of the same type. For unsigned integers the negative is * still an unsigned integer. E.g. in UInt(8), the negative of 56 is * 200, because 56 + 200 == 0 */ Expr operator-(Expr a); /** Modify the first expression to be the difference of two expressions, * without changing its type. This casts the second argument to match * the type of the first. */ Expr &operator-=(Expr &a, Expr b); /** Return the product of two expressions, doing any necessary type * coercion using \ref Internal::match_types */ Expr operator*(Expr a, Expr b); /** Multiply an expression and a constant integer. Coerces the type of the * integer to match the type of the expression. Errors if the integer * cannot be represented in the type of the expression. */ Expr operator*(Expr a, int b); /** Multiply a constant integer and an expression. Coerces the type of * the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator*(int a, Expr b); /** Modify the first expression to be the product of two expressions, * without changing its type. This casts the second argument to match * the type of the first. */ Expr &operator*=(Expr &a, Expr b); /** Return the ratio of two expressions, doing any necessary type * coercion using \ref Internal::match_types. Note that integer * division in Halide is not the same as integer division in C-like * languages in two ways. * * First, signed integer division in Halide rounds according to the * sign of the denominator. This means towards minus infinity for * positive denominators, and towards positive infinity for negative * denominators. This is unlike C, which rounds towards zero. This * decision ensures that upsampling expressions like f(x/2, y/2) don't * have funny discontinuities when x and y cross zero. * * Second, division by zero returns zero instead of faulting. 
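 * For instance (an illustrative sketch of the rules described here):
 * \code
 * Expr a = -7, b = 2;
 * // a / b evaluates to -4 (rounds toward negative infinity; C would give -3)
 * // a % b evaluates to 1, so (a / b) * b + a % b == a
 * // a / 0 and a % 0 both evaluate to 0
 * \endcode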
For * types where overflow is defined behavior, division of the largest * negative signed integer by -1 returns the larged negative signed * integer for the type (i.e. it wraps). This ensures that a division * operation can never have a side-effect, which is helpful in Halide * because scheduling directives can expand the domain of computation * of a Func, potentially introducing new zero-division. */ Expr operator/(Expr a, Expr b); /** Modify the first expression to be the ratio of two expressions, * without changing its type. This casts the second argument to match * the type of the first. Note that signed integer division in Halide * rounds towards minus infinity, unlike C, which rounds towards * zero. */ Expr &operator/=(Expr &a, Expr b); /** Divides an expression by a constant integer. Coerces the type * of the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator/(Expr a, int b); /** Divides a constant integer by an expression. Coerces the type * of the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator/(int a, Expr b); /** Return the first argument reduced modulo the second, doing any * necessary type coercion using \ref Internal::match_types. There are * two key differences between C-like languages and Halide for the * modulo operation, which complement the way division works. * * First, the result is never negative, so x % 2 is always zero or * one, unlike in C-like languages. x % -2 is equivalent, and is also * always zero or one. Second, mod by zero evaluates to zero (unlike * in C, where it faults). This makes modulo, like division, a * side-effect-free operation. */ Expr operator%(Expr a, Expr b); /** Mods an expression by a constant integer. Coerces the type * of the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator%(Expr a, int b); /** Mods a constant integer by an expression. Coerces the type * of the integer to match the type of the expression. Errors if the * integer cannot be represented in the type of the expression. */ Expr operator%(int a, Expr b); /** Return a boolean expression that tests whether the first argument * is greater than the second, after doing any necessary type coercion * using \ref Internal::match_types */ Expr operator>(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * greater than a constant integer. Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator>(Expr a, int b); /** Return a boolean expression that tests whether a constant integer is * greater than an expression. Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator>(int a, Expr b); /** Return a boolean expression that tests whether the first argument * is less than the second, after doing any necessary type coercion * using \ref Internal::match_types */ Expr operator<(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * less than a constant integer. Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator<(Expr a, int b); /** Return a boolean expression that tests whether a constant integer is * less than an expression. 
Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator<(int a, Expr b); /** Return a boolean expression that tests whether the first argument * is less than or equal to the second, after doing any necessary type * coercion using \ref Internal::match_types */ Expr operator<=(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * less than or equal to a constant integer. Coerces the integer to * the type of the expression. Errors if the integer is not * representable in that type. */ Expr operator<=(Expr a, int b); /** Return a boolean expression that tests whether a constant integer * is less than or equal to an expression. Coerces the integer to the * type of the expression. Errors if the integer is not representable * in that type. */ Expr operator<=(int a, Expr b); /** Return a boolean expression that tests whether the first argument * is greater than or equal to the second, after doing any necessary * type coercion using \ref Internal::match_types */ Expr operator>=(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * greater than or equal to a constant integer. Coerces the integer to * the type of the expression. Errors if the integer is not * representable in that type. */ Expr operator>=(const Expr &a, int b); /** Return a boolean expression that tests whether a constant integer * is greater than or equal to an expression. Coerces the integer to the * type of the expression. Errors if the integer is not representable * in that type. */ Expr operator>=(int a, const Expr &b); /** Return a boolean expression that tests whether the first argument * is equal to the second, after doing any necessary type coercion * using \ref Internal::match_types */ Expr operator==(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * equal to a constant integer. Coerces the integer to the type of the * expression. Errors if the integer is not representable in that * type. */ Expr operator==(Expr a, int b); /** Return a boolean expression that tests whether a constant integer * is equal to an expression. Coerces the integer to the type of the * expression. Errors if the integer is not representable in that * type. */ Expr operator==(int a, Expr b); /** Return a boolean expression that tests whether the first argument * is not equal to the second, after doing any necessary type coercion * using \ref Internal::match_types */ Expr operator!=(Expr a, Expr b); /** Return a boolean expression that tests whether an expression is * not equal to a constant integer. Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator!=(Expr a, int b); /** Return a boolean expression that tests whether a constant integer * is not equal to an expression. Coerces the integer to the type of * the expression. Errors if the integer is not representable in that * type. */ Expr operator!=(int a, Expr b); /** Returns the logical and of the two arguments */ Expr operator&&(Expr a, Expr b); /** Logical and of an Expr and a bool. Either returns the Expr or an * Expr representing false, depending on the bool. */ // @{ Expr operator&&(Expr a, bool b); Expr operator&&(bool a, Expr b); // @} /** Returns the logical or of the two arguments */ Expr operator||(Expr a, Expr b); /** Logical or of an Expr and a bool. Either returns the Expr or an * Expr representing true, depending on the bool. 
*/ // @{ Expr operator||(Expr a, bool b); Expr operator||(bool a, Expr b); // @} /** Returns the logical not the argument */ Expr operator!(Expr a); /** Returns an expression representing the greater of the two * arguments, after doing any necessary type coercion using * \ref Internal::match_types. Vectorizes cleanly on most platforms * (with the exception of integer types on x86 without SSE4). */ Expr max(Expr a, Expr b); /** Returns an expression representing the greater of an expression * and a constant integer. The integer is coerced to the type of the * expression. Errors if the integer is not representable as that * type. Vectorizes cleanly on most platforms (with the exception of * integer types on x86 without SSE4). */ Expr max(Expr a, int b); /** Returns an expression representing the greater of a constant * integer and an expression. The integer is coerced to the type of * the expression. Errors if the integer is not representable as that * type. Vectorizes cleanly on most platforms (with the exception of * integer types on x86 without SSE4). */ Expr max(int a, Expr b); inline Expr max(float a, Expr b) { return max(Expr(a), std::move(b)); } inline Expr max(Expr a, float b) { return max(std::move(a), Expr(b)); } /** Returns an expression representing the greater of an expressions * vector, after doing any necessary type coersion using * \ref Internal::match_types. Vectorizes cleanly on most platforms * (with the exception of integer types on x86 without SSE4). * The expressions are folded from right ie. max(.., max(.., ..)). * The arguments can be any mix of types but must all be convertible to Expr. */ template::value>::type * = nullptr> inline Expr max(A &&a, B &&b, C &&c, Rest &&... rest) { return max(std::forward(a), max(std::forward(b), std::forward(c), std::forward(rest)...)); } Expr min(Expr a, Expr b); /** Returns an expression representing the lesser of an expression * and a constant integer. The integer is coerced to the type of the * expression. Errors if the integer is not representable as that * type. Vectorizes cleanly on most platforms (with the exception of * integer types on x86 without SSE4). */ Expr min(Expr a, int b); /** Returns an expression representing the lesser of a constant * integer and an expression. The integer is coerced to the type of * the expression. Errors if the integer is not representable as that * type. Vectorizes cleanly on most platforms (with the exception of * integer types on x86 without SSE4). */ Expr min(int a, Expr b); inline Expr min(float a, Expr b) { return min(Expr(a), std::move(b)); } inline Expr min(Expr a, float b) { return min(std::move(a), Expr(b)); } /** Returns an expression representing the lesser of an expressions * vector, after doing any necessary type coersion using * \ref Internal::match_types. Vectorizes cleanly on most platforms * (with the exception of integer types on x86 without SSE4). * The expressions are folded from right ie. min(.., min(.., ..)). * The arguments can be any mix of types but must all be convertible to Expr. */ template::value>::type * = nullptr> inline Expr min(A &&a, B &&b, C &&c, Rest &&... rest) { return min(std::forward(a), min(std::forward(b), std::forward(c), std::forward(rest)...)); } /** Operators on floats treats those floats as Exprs. Making these * explicit prevents implicit float->int casts that might otherwise * occur. 
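 * For example (a brief sketch; `x` stands for some existing Expr):
 * \code
 * Expr y = x + 1.5f;  // uses the Expr/float overload below; 1.5f is not truncated to an int
 * \endcode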
*/ // @{ inline Expr operator+(Expr a, float b) { return std::move(a) + Expr(b); } inline Expr operator+(float a, Expr b) { return Expr(a) + std::move(b); } inline Expr operator-(Expr a, float b) { return std::move(a) - Expr(b); } inline Expr operator-(float a, Expr b) { return Expr(a) - std::move(b); } inline Expr operator*(Expr a, float b) { return std::move(a) * Expr(b); } inline Expr operator*(float a, Expr b) { return Expr(a) * std::move(b); } inline Expr operator/(Expr a, float b) { return std::move(a) / Expr(b); } inline Expr operator/(float a, Expr b) { return Expr(a) / std::move(b); } inline Expr operator%(Expr a, float b) { return std::move(a) % Expr(b); } inline Expr operator%(float a, Expr b) { return Expr(a) % std::move(b); } inline Expr operator>(Expr a, float b) { return std::move(a) > Expr(b); } inline Expr operator>(float a, Expr b) { return Expr(a) > std::move(b); } inline Expr operator<(Expr a, float b) { return std::move(a) < Expr(b); } inline Expr operator<(float a, Expr b) { return Expr(a) < std::move(b); } inline Expr operator>=(Expr a, float b) { return std::move(a) >= Expr(b); } inline Expr operator>=(float a, Expr b) { return Expr(a) >= std::move(b); } inline Expr operator<=(Expr a, float b) { return std::move(a) <= Expr(b); } inline Expr operator<=(float a, Expr b) { return Expr(a) <= std::move(b); } inline Expr operator==(Expr a, float b) { return std::move(a) == Expr(b); } inline Expr operator==(float a, Expr b) { return Expr(a) == std::move(b); } inline Expr operator!=(Expr a, float b) { return std::move(a) != Expr(b); } inline Expr operator!=(float a, Expr b) { return Expr(a) != std::move(b); } // @} /** Clamps an expression to lie within the given bounds. The bounds * are type-cast to match the expression. Vectorizes as well as min/max. */ Expr clamp(Expr a, const Expr &min_val, const Expr &max_val); /** Returns the absolute value of a signed integer or floating-point * expression. Vectorizes cleanly. Unlike in C, abs of a signed * integer returns an unsigned integer of the same bit width. This * means that abs of the most negative integer doesn't overflow. */ Expr abs(Expr a); /** Return the absolute difference between two values. Vectorizes * cleanly. Returns an unsigned value of the same bit width. There are * various ways to write this yourself, but they contain numerous * gotchas and don't always compile to good code, so use this * instead. */ Expr absd(Expr a, Expr b); /** Returns an expression similar to the ternary operator in C, except * that it always evaluates all arguments. If the first argument is * true, then return the second, else return the third. Typically * vectorizes cleanly, but benefits from SSE41 or newer on x86. */ Expr select(Expr condition, Expr true_value, Expr false_value); /** A multi-way variant of select similar to a switch statement in C, * which can accept multiple conditions and values in pairs. Evaluates * to the first value for which the condition is true. Returns the * final value if all conditions are false. */ template::value>::type * = nullptr> inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&... args) { return select(std::move(c0), std::move(v0), select(std::move(c1), std::move(v1), std::forward(args)...)); } /** Equivalent of ternary select(), but taking/returning tuples. If the condition is * a Tuple, it must match the size of the true and false Tuples. 
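 *
 * A minimal sketch of the two-Tuple form (the Var x is an assumed name):
 * \code
 * Var x;
 * Tuple a(x, x + 1), b(2 * x, 3 * x);
 * Tuple r = tuple_select(x > 0, a, b);
 * \endcode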
*/ // @{ Tuple tuple_select(const Tuple &condition, const Tuple &true_value, const Tuple &false_value); Tuple tuple_select(const Expr &condition, const Tuple &true_value, const Tuple &false_value); // @} /** Equivalent of multiway select(), but taking/returning tuples. If the condition is * a Tuple, it must match the size of the true and false Tuples. */ // @{ template<typename... Args> inline Tuple tuple_select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&... args) { return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...)); } template<typename... Args> inline Tuple tuple_select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&... args) { return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...)); } // @} /** Oftentimes we want to pack a list of expressions with the same type * into a channel dimension, e.g., * img(x, y, c) = select(c == 0, 100, // Red * c == 1, 50, // Green * 25); // Blue * This is tedious when the list is long. The following function * provides convenient syntax that allows one to write: * img(x, y, c) = mux(c, {100, 50, 25}); */ // @{ Expr mux(const Expr &id, const std::initializer_list<Expr> &values); Expr mux(const Expr &id, const std::vector<Expr> &values); Expr mux(const Expr &id, const Tuple &values); // @} /** Return the sine of a floating-point expression. If the argument is * not floating-point, it is cast to Float(32). Does not vectorize * well. */ Expr sin(Expr x); /** Return the arcsine of a floating-point expression. If the argument * is not floating-point, it is cast to Float(32). Does not vectorize * well. */ Expr asin(Expr x); /** Return the cosine of a floating-point expression. If the argument * is not floating-point, it is cast to Float(32). Does not vectorize * well. */ Expr cos(Expr x); /** Return the arccosine of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). Does not * vectorize well. */ Expr acos(Expr x); /** Return the tangent of a floating-point expression. If the argument * is not floating-point, it is cast to Float(32). Does not vectorize * well. */ Expr tan(Expr x); /** Return the arctangent of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). Does not * vectorize well. */ Expr atan(Expr x); /** Return the angle of a floating-point gradient. If the argument is * not floating-point, it is cast to Float(32). Does not vectorize * well. */ Expr atan2(Expr y, Expr x); /** Return the hyperbolic sine of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). Does not * vectorize well. */ Expr sinh(Expr x); /** Return the hyperbolic arcsine of a floating-point expression. If * the argument is not floating-point, it is cast to Float(32). Does * not vectorize well. */ Expr asinh(Expr x); /** Return the hyperbolic cosine of a floating-point expression. If * the argument is not floating-point, it is cast to Float(32). Does * not vectorize well. */ Expr cosh(Expr x); /** Return the hyperbolic arccosine of a floating-point expression. * If the argument is not floating-point, it is cast to * Float(32). Does not vectorize well. */ Expr acosh(Expr x); /** Return the hyperbolic tangent of a floating-point expression. If * the argument is not floating-point, it is cast to Float(32). Does * not vectorize well. */ Expr tanh(Expr x); /** Return the hyperbolic arctangent of a floating-point expression. * If the argument is not floating-point, it is cast to * Float(32). Does not vectorize well.
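 *
 * A minimal sketch (the Var x is an assumed name; the explicit cast just
 * makes the Float(32) conversion visible):
 * \code
 * Var x;
 * Expr t = tanh(cast<float>(x) * 0.1f);
 * Expr a = atanh(t);  // approximately recovers cast<float>(x) * 0.1f
 * \endcode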
*/ Expr atanh(Expr x); /** Return the square root of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). Typically * vectorizes cleanly. */ Expr sqrt(Expr x); /** Return the square root of the sum of the squares of two * floating-point expressions. If the argument is not floating-point, * it is cast to Float(32). Vectorizes cleanly. */ Expr hypot(const Expr &x, const Expr &y); /** Return the exponential of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). For * Float(64) arguments, this calls the system exp function, and does * not vectorize well. For Float(32) arguments, this function is * vectorizable, does the right thing for extremely small or extremely * large inputs, and is accurate up to the last bit of the * mantissa. Vectorizes cleanly. */ Expr exp(Expr x); /** Return the logarithm of a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). For * Float(64) arguments, this calls the system log function, and does * not vectorize well. For Float(32) arguments, this function is * vectorizable, does the right thing for inputs <= 0 (returns -inf or * nan), and is accurate up to the last bit of the * mantissa. Vectorizes cleanly. */ Expr log(Expr x); /** Return one floating point expression raised to the power of * another. The type of the result is given by the type of the first * argument. If the first argument is not a floating-point type, it is * cast to Float(32). For Float(32), cleanly vectorizable, and * accurate up to the last few bits of the mantissa. Gets worse when * approaching overflow. Vectorizes cleanly. */ Expr pow(Expr x, Expr y); /** Evaluate the error function erf. Only available for * Float(32). Accurate up to the last three bits of the * mantissa. Vectorizes cleanly. */ Expr erf(const Expr &x); /** Fast vectorizable approximation to some trigonometric functions for Float(32). * Absolute approximation error is less than 1e-5. */ // @{ Expr fast_sin(const Expr &x); Expr fast_cos(const Expr &x); // @} /** Fast approximate cleanly vectorizable log for Float(32). Returns * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the * mantissa. Vectorizes cleanly. */ Expr fast_log(const Expr &x); /** Fast approximate cleanly vectorizable exp for Float(32). Returns * nonsense for inputs that would overflow or underflow. Typically * accurate up to the last 5 bits of the mantissa. Gets worse when * approaching overflow. Vectorizes cleanly. */ Expr fast_exp(const Expr &x); /** Fast approximate cleanly vectorizable pow for Float(32). Returns * nonsense for x < 0.0f. Accurate up to the last 5 bits of the * mantissa for typical exponents. Gets worse when approaching * overflow. Vectorizes cleanly. */ Expr fast_pow(Expr x, Expr y); /** Fast approximate inverse for Float(32). Corresponds to the rcpps * instruction on x86, and the vrecpe instruction on ARM. Vectorizes * cleanly. Note that this can produce slightly different results * across different implementations of the same architecture (e.g. AMD vs Intel), * even when strict_float is enabled. */ Expr fast_inverse(Expr x); /** Fast approximate inverse square root for Float(32). Corresponds to * the rsqrtps instruction on x86, and the vrsqrte instruction on * ARM. Vectorizes cleanly. Note that this can produce slightly different results * across different implementations of the same architecture (e.g. AMD vs Intel), * even when strict_float is enabled. 
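 *
 * A minimal sketch contrasting the approximation with the exact form
 * (the Var x is an assumed name):
 * \code
 * Var x;
 * Expr xf = cast<float>(x) + 1.0f;
 * Expr approx = fast_inverse_sqrt(xf);
 * Expr exact = 1.0f / sqrt(xf);
 * \endcode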
*/ Expr fast_inverse_sqrt(Expr x); /** Return the greatest whole number less than or equal to a * floating-point expression. If the argument is not floating-point, * it is cast to Float(32). The return value is still in floating * point, despite being a whole number. Vectorizes cleanly. */ Expr floor(Expr x); /** Return the least whole number greater than or equal to a * floating-point expression. If the argument is not floating-point, * it is cast to Float(32). The return value is still in floating * point, despite being a whole number. Vectorizes cleanly. */ Expr ceil(Expr x); /** Return the whole number closest to a floating-point expression. If the * argument is not floating-point, it is cast to Float(32). The return value * is still in floating point, despite being a whole number. On ties, we * follow IEEE754 conventions and round to the nearest even number. Vectorizes * cleanly. */ Expr round(Expr x); /** Return the integer part of a floating-point expression. If the argument is * not floating-point, it is cast to Float(32). The return value is still in * floating point, despite being a whole number. Vectorizes cleanly. */ Expr trunc(Expr x); /** Returns true if the argument is a Not a Number (NaN). Requires a * floating point argument. Vectorizes cleanly. * Note that the Expr passed in will be evaluated in strict_float mode, * regardless of whether strict_float mode is enabled in the current Target. */ Expr is_nan(Expr x); /** Returns true if the argument is Inf or -Inf. Requires a * floating point argument. Vectorizes cleanly. * Note that the Expr passed in will be evaluated in strict_float mode, * regardless of whether strict_float mode is enabled in the current Target. */ Expr is_inf(Expr x); /** Returns true if the argument is a finite value (ie, neither NaN nor Inf). * Requires a floating point argument. Vectorizes cleanly. * Note that the Expr passed in will be evaluated in strict_float mode, * regardless of whether strict_float mode is enabled in the current Target. */ Expr is_finite(Expr x); /** Return the fractional part of a floating-point expression. If the argument * is not floating-point, it is cast to Float(32). The return value has the * same sign as the original expression. Vectorizes cleanly. */ Expr fract(const Expr &x); /** Reinterpret the bits of one value as another type. */ Expr reinterpret(Type t, Expr e); template Expr reinterpret(Expr e) { return reinterpret(type_of(), e); } /** Return the bitwise and of two expressions (which need not have the * same type). The result type is the wider of the two expressions. * Only integral types are allowed and both expressions must be signed * or both must be unsigned. */ Expr operator&(Expr x, Expr y); /** Return the bitwise and of an expression and an integer. The type * of the result is the type of the expression argument. */ // @{ Expr operator&(Expr x, int y); Expr operator&(int x, Expr y); // @} /** Return the bitwise or of two expressions (which need not have the * same type). The result type is the wider of the two expressions. * Only integral types are allowed and both expressions must be signed * or both must be unsigned. */ Expr operator|(Expr x, Expr y); /** Return the bitwise or of an expression and an integer. The type of * the result is the type of the expression argument. */ // @{ Expr operator|(Expr x, int y); Expr operator|(int x, Expr y); // @} /** Return the bitwise xor of two expressions (which need not have the * same type). The result type is the wider of the two expressions. 
* Only integral types are allowed and both expressions must be signed * or both must be unsigned. */ Expr operator^(Expr x, Expr y); /** Return the bitwise xor of an expression and an integer. The type * of the result is the type of the expression argument. */ // @{ Expr operator^(Expr x, int y); Expr operator^(int x, Expr y); // @} /** Return the bitwise not of an expression. */ Expr operator~(Expr x); /** Shift the bits of an integer value left. This is actually less * efficient than multiplying by 2^n, because Halide's optimization * passes understand multiplication, and will compile it to * shifting. This operator is only for if you really really need bit * shifting (e.g. because the exponent is a run-time parameter). The * type of the result is equal to the type of the first argument. Both * arguments must have integer type. */ // @{ Expr operator<<(Expr x, Expr y); Expr operator<<(Expr x, int y); // @} /** Shift the bits of an integer value right. Does sign extension for * signed integers. This is less efficient than dividing by a power of * two. Halide's definition of division (always round to negative * infinity) means that all divisions by powers of two get compiled to * bit-shifting, and Halide's optimization routines understand * division and can work with it. The type of the result is equal to * the type of the first argument. Both arguments must have integer * type. */ // @{ Expr operator>>(Expr x, Expr y); Expr operator>>(Expr x, int y); // @} /** Linear interpolate between the two values according to a weight. * \param zero_val The result when weight is 0 * \param one_val The result when weight is 1 * \param weight The interpolation amount * * Both zero_val and one_val must have the same type. All types are * supported, including bool. * * The weight is treated as its own type and must be float or an * unsigned integer type. It is scaled to the bit-size of the type of * x and y if they are integer, or converted to float if they are * float. Integer weights are converted to float via division by the * full-range value of the weight's type. Floating-point weights used * to interpolate between integer values must be between 0.0f and * 1.0f, and an error may be signaled if it is not provably so. (clamp * operators can be added to provide proof. Currently an error is only * signalled for constant weights.) * * For integer linear interpolation, out of range values cannot be * represented. In particular, weights that are conceptually less than * 0 or greater than 1.0 are not representable. As such the result is * always between x and y (inclusive of course). For lerp with * floating-point values and floating-point weight, the full range of * a float is valid, however underflow and overflow can still occur. * * Ordering is not required between zero_val and one_val: * lerp(42, 69, .5f) == lerp(69, 42, .5f) == 56 * * Results for integer types are for exactly rounded arithmetic. As * such, there are cases where 16-bit and float differ because 32-bit * floating-point (float) does not have enough precision to produce * the exact result. (Likely true for 32-bit integer * vs. double-precision floating-point as well.) * * At present, double precision and 64-bit integers are not supported. * * Generally, lerp will vectorize as if it were an operation on a type * twice the bit size of the inferred type for x and y. * * Some examples: * \code * * // Since Halide does not have direct type delcarations, casts * // below are used to indicate the types of the parameters. 
* // Such casts not required or expected in actual code where types * // are inferred. * * lerp(cast(x), cast(y), cast(w)) -> * x * (1.0f - w) + y * w * * lerp(cast(x), cast(y), cast(w)) -> * cast(cast(x) * (1.0f - cast(w) / 255.0f) + * cast(y) * cast(w) / 255.0f + .5f) * * // Note addition in Halide promoted uint8_t + int8_t to int16_t already, * // the outer cast is added for clarity. * lerp(cast(x), cast(y), cast(w)) -> * cast(cast(x) * (1.0f - cast(w) / 255.0f) + * cast(y) * cast(w) / 255.0f + .5f) * * lerp(cast(x), cast(y), cast(w)) -> * cast(cast(x) * (1.0f - cast(w)) + * cast(y) * cast(w)) * * \endcode * */ Expr lerp(Expr zero_val, Expr one_val, Expr weight); /** Count the number of set bits in an expression. */ Expr popcount(Expr x); /** Count the number of leading zero bits in an expression. If the expression is * zero, the result is the number of bits in the type. */ Expr count_leading_zeros(Expr x); /** Count the number of trailing zero bits in an expression. If the expression is * zero, the result is the number of bits in the type. */ Expr count_trailing_zeros(Expr x); /** Divide two integers, rounding towards zero. This is the typical * behavior of most hardware architectures, which differs from * Halide's division operator, which is Euclidean (rounds towards * -infinity). Will throw a runtime error if y is zero, or if y is -1 * and x is the minimum signed integer. */ Expr div_round_to_zero(Expr x, Expr y); /** Compute the remainder of dividing two integers, when division is * rounding toward zero. This is the typical behavior of most hardware * architectures, which differs from Halide's mod operator, which is * Euclidean (produces the remainder when division rounds towards * -infinity). Will throw a runtime error if y is zero. */ Expr mod_round_to_zero(Expr x, Expr y); /** Return a random variable representing a uniformly distributed * float in the half-open interval [0.0f, 1.0f). For random numbers of * other types, use lerp with a random float as the last parameter. * * Optionally takes a seed. * * Note that: \code Expr x = random_float(); Expr y = x + x; \endcode * * is very different to * \code Expr y = random_float() + random_float(); \endcode * * The first doubles a random variable, and the second adds two * independent random variables. * * A given random variable takes on a unique value that depends * deterministically on the pure variables of the function they belong * to, the identity of the function itself, and which definition of * the function it is used in. They are, however, shared across tuple * elements. * * This function vectorizes cleanly. */ Expr random_float(Expr seed = Expr()); /** Return a random variable representing a uniformly distributed * unsigned 32-bit integer. See \ref random_float. Vectorizes cleanly. */ Expr random_uint(Expr seed = Expr()); /** Return a random variable representing a uniformly distributed * 32-bit integer. See \ref random_float. Vectorizes cleanly. */ Expr random_int(Expr seed = Expr()); /** Create an Expr that prints out its value whenever it is * evaluated. It also prints out everything else in the arguments * list, separated by spaces. This can include string literals. */ //@{ Expr print(const std::vector &values); template inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&... 
args) { std::vector<Expr> collected_args = {std::move(a)}; Internal::collect_print_args(collected_args, std::forward<Args>(args)...); return print(collected_args); } //@} /** Create an Expr that prints whenever it is evaluated, provided that * the condition is true. */ // @{ Expr print_when(Expr condition, const std::vector<Expr> &values); template<typename... Args> inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&... args) { std::vector<Expr> collected_args = {std::move(a)}; Internal::collect_print_args(collected_args, std::forward<Args>(args)...); return print_when(std::move(condition), collected_args); } // @} /** Create an Expr that guarantees a precondition. * If 'condition' is true, the return value is equal to the first Expr. * If 'condition' is false, halide_error() is called, and the return value * is arbitrary. Any additional arguments after the first Expr are stringified * and passed as a user-facing message to halide_error(), similar to print(). * * Note that this essentially *always* inserts a runtime check into the * generated code (except when the condition can be proven at compile time); * as such, it should be avoided inside inner loops, except for debugging * or testing purposes. Note also that it does not vectorize cleanly (vector * values will be scalarized for the check). * * However, using this to make assertions about (say) input values * can be useful, both in terms of correctness and (potentially) in terms * of code generation, e.g. \code Param<int> p; Expr y = require(p > 0, p); \endcode * will allow the optimizer to assume positive, nonzero values for y. */ // @{ Expr require(Expr condition, const std::vector<Expr> &values); template<typename... Args> inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&... args) { std::vector<Expr> collected_args = {std::move(value)}; Internal::collect_print_args(collected_args, std::forward<Args>(args)...); return require(std::move(condition), collected_args); } // @} /** Return an undef value of the given type. Halide skips stores that * depend on undef values, so you can use this to mean "do not modify * this memory location". This is an escape hatch that can be used for * several things: * * You can define a reduction with no pure step, by setting the pure * step to undef. Do this only if you're confident that the update * steps are sufficient to correctly fill in the domain. * * For a tuple-valued reduction, you can write an update step that * only updates some tuple elements. * * You can define a single-stage pipeline that only has update steps, * and depends on the values already in the output buffer. * * Use this feature with great caution, as you can use it to load from * uninitialized memory. */ Expr undef(Type t); template<typename T> inline Expr undef() { return undef(type_of<T>()); } /** Control the values used in the memoization cache key for memoize. * Normally parameters and other external dependencies are * automatically inferred and added to the cache key. The memoize_tag * operator allows computing one expression and using either the * computed value, or one or more other expressions in the cache key * instead of the parameter dependencies of the computation. The * single argument version is completely safe in that the cache key * will use the actual computed value -- it is difficult or impossible * to produce erroneous caching this way. The more-than-one argument * version allows generating cache keys that do not uniquely identify * the computation and thus can result in caching errors.
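 *
 * A minimal sketch of the single-argument form, illustrating the
 * quantized-parameter use described in the next paragraph (the Func,
 * Var and Param names are assumed):
 * \code
 * Param<float> gain;
 * Var x;
 * Func f;
 * Expr key = memoize_tag(cast<int>(gain * 8.0f));
 * f(x) = x + key;
 * f.compute_root().memoize();
 * \endcode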
* * A potential use for the single argument version is to handle a * floating-point parameter that is quantized to a small * integer. Multiple values of the float will produce the same integer * and moving the caching to using the integer for the key is more * efficient. * * The main use for the more-than-one argument version is to provide * cache key information for Handles and ImageParams, which otherwise * are not allowed inside compute_cached operations. E.g. when passing * a group of parameters to an external array function via a Handle, * memoize_tag can be used to isolate the actual values used by that * computation. If an ImageParam is a constant image with a persistent * digest, memoize_tag can be used to key computations using that image * on the digest. */ // @{ template<typename... Args> inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&... args) { std::vector<Expr> collected_args{std::forward<Args>(args)...}; return Internal::memoize_tag_helper(std::move(result), collected_args); } // @} /** Expressions tagged with this intrinsic are considered to be part * of the steady state of some loop with a nasty beginning and end * (e.g. a boundary condition). When Halide encounters likely * intrinsics, it splits the containing loop body into three, and * tries to simplify down all conditions that lead to the likely. For * example, given the expression: select(x < 1, bar, x > 10, bar, * likely(foo)), Halide will split the loop over x into portions where * x < 1, 1 <= x <= 10, and x > 10. * * You're unlikely to want to call this directly. You probably want to * use the boundary condition helpers in the BoundaryConditions * namespace instead. */ Expr likely(Expr e); /** Equivalent to likely, but only triggers a loop partitioning if * found in an innermost loop. */ Expr likely_if_innermost(Expr e); /** Cast an expression to the halide type corresponding to the C++ * type T. As part of the cast, clamp to the minimum and maximum * values of the result type. */ template<typename T> Expr saturating_cast(Expr e) { return saturating_cast(type_of<T>(), std::move(e)); } /** Cast an expression to a new type, clamping to the minimum and * maximum values of the result type. */ Expr saturating_cast(Type t, Expr e); /** Makes a best effort attempt to preserve IEEE floating-point * semantics in evaluating an expression. May not be implemented for * all backends. (E.g. it is difficult to do this for C++ code * generation as it depends on the compiler flags used to compile the * generated code.) */ Expr strict_float(Expr e); /** Create an Expr that promises another Expr is clamped but does * not generate code to check the assertion or modify the value. No * attempt is made to prove the bound at compile time. (If it is * proved false as a result of something else, an error might be * generated, but it is also possible the compiler will crash.) The * promised bound is used in bounds inference so it will allow * satisfying bounds checks as well as possibly aiding optimization. * * unsafe_promise_clamped returns its first argument, the Expr 'value' * * This is a very easy way to make Halide generate erroneous code if * the promised bound is not kept. Use sparingly when there is no * other way to convey the information to the compiler and it is * required for a valuable optimization. * * Unsafe promises can be checked by turning on * Target::CheckUnsafePromises. This is intended for debugging only. */ Expr unsafe_promise_clamped(const Expr &value, const Expr &min, const Expr &max); namespace Internal { /** * FOR INTERNAL USE ONLY.
* * An entirely unchecked version of unsafe_promise_clamped, used * inside the compiler as an annotation of the known bounds of an Expr * when it has proved something is bounded and wants to record that * fact for later passes (notably bounds inference) to exploit. This * gets introduced by GuardWithIf tail strategies, because the bounds * machinery has a hard time exploiting if statement conditions. * * Unlike unsafe_promise_clamped, this expression is * context-dependent, because 'value' might be statically bounded at * some point in the IR (e.g. due to a containing if statement), but * not elsewhere. **/ Expr promise_clamped(const Expr &value, const Expr &min, const Expr &max); } // namespace Internal } // namespace Halide #endif #include #include #include namespace Halide { namespace Internal { /** * Represent an associative op with its identity. The op may be multi-dimensional, * e.g. complex multiplication. 'is_commutative' is set to true if the op is also * commutative in addition to being associative. * * For example, complex multiplication is represented as: \code AssociativePattern pattern( {x0 * y0 - x1 * y1, x1 * y0 + x0 * y1}, {one, zero}, true ); \endcode */ struct AssociativePattern { /** Contain the binary operators for each dimension of the associative op. */ std::vector ops; /** Contain the identities for each dimension of the associative op. */ std::vector identities; /** Indicate if the associative op is also commutative. */ bool is_commutative; AssociativePattern() : is_commutative(false) { } AssociativePattern(size_t size) : ops(size), identities(size), is_commutative(false) { } AssociativePattern(const std::vector &ops, const std::vector &ids, bool is_commutative) : ops(ops), identities(ids), is_commutative(is_commutative) { } AssociativePattern(Expr op, Expr id, bool is_commutative) : ops({std::move(op)}), identities({std::move(id)}), is_commutative(is_commutative) { } bool operator==(const AssociativePattern &other) const { if ((is_commutative != other.is_commutative) || (ops.size() != other.ops.size())) { return false; } for (size_t i = 0; i < size(); ++i) { if (!equal(ops[i], other.ops[i]) || !equal(identities[i], other.identities[i])) { return false; } } return true; } bool operator!=(const AssociativePattern &other) const { return !(*this == other); } size_t size() const { return ops.size(); } bool commutative() const { return is_commutative; } }; const std::vector &get_ops_table(const std::vector &exprs); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_ASSOCIATIVITY_H #define HALIDE_ASSOCIATIVITY_H /** \file * * Methods for extracting an associative operator from a Func's update definition * if there is any and computing the identity of the associative operator. */ #include #include #include #include namespace Halide { namespace Internal { /** * Represent the equivalent associative op of an update definition. * For example, the following associative Expr, min(f(x), g(r.x) + 2), * where f(x) is the self-recurrence term, is represented as: \code AssociativeOp assoc( AssociativePattern(min(x, y), +inf, true), {Replacement("x", f(x))}, {Replacement("y", g(r.x) + 2)}, true ); \endcode * * 'pattern' contains the list of equivalent binary/unary operators (+ identities) * for each Tuple element in the update definition. 'pattern' also contains * a boolean that indicates if the op is also commutative. 'xs' and 'ys' * contain the corresponding definition of each variable in the list of * binary operators. * * For unary operator, 'xs' is not set, i.e. 
it will be a pair of empty string * and undefined Expr: {"", Expr()}. 'pattern' will only contain the 'y' term in * this case. For example, min(g(r.x), 4), will be represented as: \code AssociativeOp assoc( AssociativePattern(y, 0, false), {Replacement("", Expr())}, {Replacement("y", min(g(r.x), 4))}, true ); \endcode * * Self-assignment, f(x) = f(x), will be represented as: \code AssociativeOp assoc( AssociativePattern(x, 0, true), {Replacement("x", f(x))}, {Replacement("", Expr())}, true ); \endcode * For both unary operator and self-assignment cases, the identity does not * matter. It can be anything. */ struct AssociativeOp { struct Replacement { /** Variable name that is used to replace the expr in 'op'. */ std::string var; Expr expr; Replacement() = default; Replacement(const std::string &var, Expr expr) : var(var), expr(std::move(expr)) { } bool operator==(const Replacement &other) const { return (var == other.var) && equal(expr, other.expr); } bool operator!=(const Replacement &other) const { return !(*this == other); } }; /** List of pairs of binary associative op and its identity. */ AssociativePattern pattern; std::vector xs; std::vector ys; bool is_associative; AssociativeOp() : is_associative(false) { } AssociativeOp(size_t size) : pattern(size), xs(size), ys(size), is_associative(false) { } AssociativeOp(const AssociativePattern &p, const std::vector &xs, const std::vector &ys, bool is_associative) : pattern(p), xs(xs), ys(ys), is_associative(is_associative) { } bool associative() const { return is_associative; } bool commutative() const { return pattern.is_commutative; } size_t size() const { return pattern.size(); } }; /** * Given an update definition of a Func 'f', determine its equivalent * associative binary/unary operator if there is any. 'is_associative' * indicates if the operation was successfuly proven as associative. */ AssociativeOp prove_associativity( const std::string &f, std::vector args, std::vector exprs); void associativity_test(); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_ASYNC_PRODUCERS_H #define HALIDE_ASYNC_PRODUCERS_H /** \file * Defines the lowering pass that injects task parallelism for producers that are scheduled as async. */ #include #include namespace Halide { namespace Internal { class Function; Stmt fork_async_producers(Stmt s, const std::map &env); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_INTERNAL_AUTO_SCHEDULE_H #define HALIDE_INTERNAL_AUTO_SCHEDULE_H /** \file * * Defines the method that does automatic scheduling of Funcs within a pipeline. */ #ifndef HALIDE_PIPELINE_H #define HALIDE_PIPELINE_H /** \file * * Defines the front-end class representing an entire Halide imaging * pipeline. */ #include #include #ifndef HALIDE_EXTERNAL_CODE_H #define HALIDE_EXTERNAL_CODE_H #include namespace Halide { class ExternalCode { private: enum Kind { LLVMBitcode, DeviceCode, CPlusPlusSource, } kind; Target llvm_target; // For LLVMBitcode. DeviceAPI device_code_kind; std::vector code; // Used for debugging and naming the module to llvm. std::string nametag; ExternalCode(Kind kind, const Target &llvm_target, DeviceAPI device_api, const std::vector &code, const std::string &name) : kind(kind), llvm_target(llvm_target), device_code_kind(device_api), code(code), nametag(name) { } public: /** Construct an ExternalCode container from llvm bitcode. The * result can be passed to Halide::Module::append to have the * contained bitcode linked with that module. 
The Module's target * must match the target argument here on architecture, bit width, * and operating system. The name is used as a unique identifier * for the external code and duplicates will be reduced to a * single instance. Halide does not do anything other than to * compare names for equality. To guarantee uniqueness in public * code, we suggest using a Java style inverted domain name * followed by organization specific naming. E.g.: * com.initech.y2k.5d2ac80aaf522eec6cb4b40f39fb923f9902bc7e */ static ExternalCode bitcode_wrapper(const Target &cpu_type, const std::vector &code, const std::string &name) { return ExternalCode(LLVMBitcode, cpu_type, DeviceAPI::None, code, name); } /** Construct an ExternalCode container from GPU "source code." * This container can be used to insert its code into the GPU code * generated for a given DeviceAPI. The specific type of code * depends on the device API used as follows: * CUDA: llvm bitcode for PTX * OpenCL: OpenCL source code * GLSL: GLSL source code * OpenGLCompute: GLSL source code * Metal: Metal source code * Hexagon: llvm bitcode for Hexagon * * At present, this API is not fully working. See Issue: * https://github.com/halide/Halide/issues/1971 * * The name is used as a unique identifier for the external code * and duplicates will be reduced to a single instance. Halide * does not do anything other than to compare names for * equality. To guarantee uniqueness in public code, we suggest * using a Java style inverted domain name followed by * organization specific naming. E.g.: * com.tyrell.nexus-6.53947db86ba97a9ca5ecd5e60052880945bfeb37 */ static ExternalCode device_code_wrapper(DeviceAPI device_api, const std::vector &code, const std::string &name) { return ExternalCode(DeviceCode, Target(), device_api, code, name); } /** Construct an ExternalCode container from C++ source code. This * container can be used to insert its code into C++ output from * Halide. * * At present, this API is not fully working. See Issue: * https://github.com/halide/Halide/issues/1971 * * The name is used as a unique identifier for the external code * and duplicates will be reduced to a single instance. Halide * does not do anything other than to compare names for * equality. To guarantee uniqueness in public code, we suggest * using a Java style inverted domain name followed by * organization specific naming. E.g.: * com.cyberdyne.skynet.78ad6c411d313f050f172cd3d440f23fdd797d0d */ static ExternalCode c_plus_plus_code_wrapper(const std::vector &code, const std::string &name) { return ExternalCode(CPlusPlusSource, Target(), DeviceAPI::None, code, name); } /** Return true if this container holds llvm bitcode linkable with * code generated for the target argument. The matching is done * on the architecture, bit width, and operating system * only. Features are ignored. If the container is for * Target::ArchUnkonwn, it applies to all architectures -- meaning * it is generic llvm bitcode. If the OS is OSUnknown, it applies * to all operationg systems. The bit width must match. * * Ignoring feature flags isn't too important since generally * ExternalCode will be constructed in a Generator which has * access to the feature flags in effect and can select code * appropriately. 
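 *
 * A minimal sketch (the buffer 'bitcode' is assumed to have been filled
 * with LLVM bitcode elsewhere; the name string is arbitrary):
 * \code
 * std::vector<uint8_t> bitcode;
 * ExternalCode ec = ExternalCode::bitcode_wrapper(
 *     get_host_target(), bitcode, "com.example.extra_kernels");
 * bool usable = ec.is_for_cpu_target(get_host_target());
 * \endcode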
*/ bool is_for_cpu_target(const Target &host) const { return kind == LLVMBitcode && (llvm_target.arch == Target::ArchUnknown || llvm_target.arch == host.arch) && (llvm_target.os == Target::OSUnknown || llvm_target.os == host.os) && (llvm_target.bits == host.bits); } /** True if this container holds code linkable with a code generated for a GPU. */ bool is_for_device_api(DeviceAPI current_device) const { return kind == DeviceCode && device_code_kind == current_device; } /** True if this container holds C++ source code for inclusion in * generating C++ output. */ bool is_c_plus_plus_source() const { return kind == CPlusPlusSource; } /** Retrieve the bytes of external code held by this container. */ const std::vector &contents() const { return code; } /** Retrieve the name of this container. Used to ensure the same * piece of external code is only included once in linkage. */ const std::string &name() const { return nametag; } }; } // namespace Halide #endif #ifndef HALIDE_JIT_MODULE_H #define HALIDE_JIT_MODULE_H /** \file * Defines the struct representing lifetime and dependencies of * a JIT compiled halide pipeline */ #include #include namespace llvm { class Module; } namespace Halide { struct ExternCFunction; struct JITExtern; struct Target; class Module; namespace Internal { class JITModuleContents; struct LoweredFunc; struct JITModule { IntrusivePtr jit_module; struct Symbol { void *address = nullptr; Symbol() = default; explicit Symbol(void *address) : address(address) { } }; JITModule(); JITModule(const Module &m, const LoweredFunc &fn, const std::vector &dependencies = std::vector()); /** Take a list of JITExterns and generate trampoline functions * which can be called dynamically via a function pointer that * takes an array of void *'s for each argument and the return * value. */ static JITModule make_trampolines_module(const Target &target, const std::map &externs, const std::string &suffix, const std::vector &deps); /** The exports map of a JITModule contains all symbols which are * available to other JITModules which depend on this one. For * runtime modules, this is all of the symbols exported from the * runtime. For a JITted Func, it generally only contains the main * result Func of the compilation, which takes its name directly * from the Func declaration. One can also make a module which * contains no code itself but is just an exports maps providing * arbitrary pointers to functions or global variables to JITted * code. */ const std::map &exports() const; /** A pointer to the raw halide function. Its true type depends * on the Argument vector passed to CodeGen_LLVM::compile. Image * parameters become (halide_buffer_t *), and scalar parameters become * pointers to the appropriate values. The final argument is a * pointer to the halide_buffer_t defining the output. This will be nullptr for * a JITModule which has not yet been compiled or one that is not * a Halide Func compilation at all. */ void *main_function() const; /** Returns the Symbol structure for the routine documented in * main_function. Returning a Symbol allows access to the LLVM * type as well as the address. The address and type will be nullptr * if the module has not been compiled. */ Symbol entrypoint_symbol() const; /** Returns the Symbol structure for the argv wrapper routine * corresponding to the entrypoint. The argv wrapper is callable * via an array of void * pointers to the arguments for the * call. Returning a Symbol allows access to the LLVM type as well * as the address. 
The address and type will be nullptr if the module * has not been compiled. */ Symbol argv_entrypoint_symbol() const; /** A slightly more type-safe wrapper around the raw halide * module. Takes its arguments as an array of pointers that * correspond to the arguments to \ref main_function . This will * be nullptr for a JITModule which has not yet been compiled or one * that is not a Halide Func compilation at all. */ // @{ typedef int (*argv_wrapper)(const void **args); argv_wrapper argv_function() const; // @} /** Add another JITModule to the dependency chain. Dependencies * are searched to resolve symbols not found in the current * compilation unit while JITting. */ void add_dependency(JITModule &dep); /** Registers a single Symbol as available to modules which depend * on this one. The Symbol structure provides both the address and * the LLVM type for the function, which allows type safe linkage of * external routines. */ void add_symbol_for_export(const std::string &name, const Symbol &extern_symbol); /** Registers a single function as available to modules which * depend on this one. This routine converts the ExternSignature * info into an LLVM type, which allows type safe linkage of * external routines. */ void add_extern_for_export(const std::string &name, const ExternCFunction &extern_c_function); /** Look up a symbol by name in this module or its dependencies. */ Symbol find_symbol_by_name(const std::string &) const; /** Take an llvm module and compile it. The requested exports will be available via the exports method. */ void compile_module(std::unique_ptr<llvm::Module> mod, const std::string &function_name, const Target &target, const std::vector<JITModule> &dependencies = std::vector<JITModule>(), const std::vector<std::string> &requested_exports = std::vector<std::string>()); /** See JITSharedRuntime::memoization_cache_set_size */ void memoization_cache_set_size(int64_t size) const; /** See JITSharedRuntime::reuse_device_allocations */ void reuse_device_allocations(bool) const; /** Return true if compile_module has been called on this module. */ bool compiled() const; }; typedef int (*halide_task)(void *user_context, int, uint8_t *); struct JITHandlers { void (*custom_print)(void *, const char *){nullptr}; void *(*custom_malloc)(void *, size_t){nullptr}; void (*custom_free)(void *, void *){nullptr}; int (*custom_do_task)(void *, halide_task, int, uint8_t *){nullptr}; int (*custom_do_par_for)(void *, halide_task, int, int, uint8_t *){nullptr}; void (*custom_error)(void *, const char *){nullptr}; int32_t (*custom_trace)(void *, const halide_trace_event_t *){nullptr}; void *(*custom_get_symbol)(const char *name){nullptr}; void *(*custom_load_library)(const char *name){nullptr}; void *(*custom_get_library_symbol)(void *lib, const char *name){nullptr}; }; struct JITUserContext { void *user_context; JITHandlers handlers; }; class JITSharedRuntime { public: // Note only the first llvm::Module passed in here is used. The same shared runtime is used for all JIT. static std::vector<JITModule> get(llvm::Module *m, const Target &target, bool create = true); static void init_jit_user_context(JITUserContext &jit_user_context, void *user_context, const JITHandlers &handlers); static JITHandlers set_default_handlers(const JITHandlers &handlers); /** Set the maximum number of bytes used by memoization caching. * If you are compiling statically, you should include HalideRuntime.h * and call halide_memoization_cache_set_size() instead.
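 *
 * For statically-compiled pipelines, a minimal sketch of the runtime
 * call mentioned above (the function name wrapping it is illustrative):
 * \code
 * #include "HalideRuntime.h"
 * void configure_cache() {
 *     halide_memoization_cache_set_size(128 * 1024 * 1024);  // 128 MB
 * }
 * \endcode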
*/ static void memoization_cache_set_size(int64_t size); /** Set whether or not Halide may hold onto and reuse device * allocations to avoid calling expensive device API allocation * functions. If you are compiling statically, you should include * HalideRuntime.h and call halide_reuse_device_allocations * instead. */ static void reuse_device_allocations(bool); static void release_all(); }; void *get_symbol_address(const char *s); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_MODULE_H #define HALIDE_MODULE_H /** \file * * Defines Module, an IR container that fully describes a Halide program. */ #include #include #include #include #ifndef HALIDE_FUNCTION_H #define HALIDE_FUNCTION_H /** \file * Defines the internal representation of a halide function and related classes */ #include #include #include #include #ifndef HALIDE_DEFINITION_H #define HALIDE_DEFINITION_H /** \file * Defines the internal representation of a halide function's definition and related classes */ #include namespace Halide { namespace Internal { struct DefinitionContents; struct FunctionContents; class ReductionDomain; } // namespace Internal namespace Internal { class IRVisitor; class IRMutator; struct Specialization; /** A Function definition which can either represent a init or an update * definition. A function may have different definitions due to specialization, * which are stored in 'specializations' (Not possible from the front-end, but * some scheduling directives may potentially cause this divergence to occur). * Although init definition may have multiple values (RHS) per specialization, it * must have the same LHS (i.e. same pure dimension variables). The update * definition, on the other hand, may have different LHS/RHS per specialization. * Note that, while the Expr in LHS/RHS may be different across specializations, * they must have the same number of dimensions and the same pure dimensions. */ class Definition { IntrusivePtr contents; public: /** Construct a Definition from an existing DefinitionContents pointer. Must be non-null */ explicit Definition(const IntrusivePtr &); /** Construct a Definition with the supplied args, values, and reduction domain. */ Definition(const std::vector &args, const std::vector &values, const ReductionDomain &rdom, bool is_init); /** Construct an undefined Definition object. */ Definition(); /** Return a copy of this Definition. */ Definition get_copy() const; /** Equality of identity */ bool same_as(const Definition &other) const { return contents.same_as(other.contents); } /** Definition objects are nullable. Does this definition exist? */ bool defined() const; /** Is this an init definition; otherwise it's an update definition */ bool is_init() const; /** Pass an IRVisitor through to all Exprs referenced in the * definition. */ void accept(IRVisitor *) const; /** Pass an IRMutator through to all Exprs referenced in the * definition. */ void mutate(IRMutator *); /** Get the default (no-specialization) arguments (left-hand-side) of the definition */ // @{ const std::vector &args() const; std::vector &args(); // @} /** Get the default (no-specialization) right-hand-side of the definition */ // @{ const std::vector &values() const; std::vector &values(); // @} /** Get the predicate on the definition */ // @{ const Expr &predicate() const; Expr &predicate(); // @} /** Split predicate into vector of ANDs. If there is no predicate (i.e. this * definition is always valid), this returns an empty vector. 
*/ std::vector split_predicate() const; /** Get the default (no-specialization) stage-specific schedule associated * with this definition. */ // @{ const StageSchedule &schedule() const; StageSchedule &schedule(); // @} /** You may create several specialized versions of a func with * different stage-specific schedules. They trigger when the condition is * true. See \ref Func::specialize */ // @{ const std::vector &specializations() const; std::vector &specializations(); const Specialization &add_specialization(Expr condition); // @} /** Attempt to get the source file and line where this definition * was made using DWARF introspection. Returns an empty string if * no debug symbols were found or the debug symbols were not * understood. Works on OS X and Linux only. */ std::string source_location() const; }; struct Specialization { Expr condition; Definition definition; std::string failure_message; // If non-empty, this specialization always assert-fails with this message. }; } // namespace Internal } // namespace Halide #endif namespace Halide { struct ExternFuncArgument; class Var; /** An enum to specify calling convention for extern stages. */ enum class NameMangling { Default, ///< Match whatever is specified in the Target C, ///< No name mangling CPlusPlus, ///< C++ name mangling }; namespace Internal { struct Call; class Parameter; /** A reference-counted handle to Halide's internal representation of * a function. Similar to a front-end Func object, but with no * syntactic sugar to help with definitions. */ class Function { FunctionPtr contents; public: /** This lets you use a Function as a key in a map of the form * map */ struct Compare { bool operator()(const Function &a, const Function &b) const { internal_assert(a.contents.defined() && b.contents.defined()); return a.contents < b.contents; } }; /** Construct a new function with no definitions and no name. This * constructor only exists so that you can make vectors of * functions, etc. */ Function(); /** Construct a new function with the given name */ explicit Function(const std::string &n); /** Construct a Function from an existing FunctionContents pointer. Must be non-null */ explicit Function(const FunctionPtr &); /** Get a handle on the halide function contents that this Function * represents. */ FunctionPtr get_contents() const { return contents; } /** Deep copy this Function into 'copy'. It recursively deep copies all called * functions, schedules, update definitions, extern func arguments, specializations, * and reduction domains. This method does not deep-copy the Parameter objects. * This method also takes a map of as input * and would use the deep-copied Function from the map if exists instead of * creating a new deep-copy to avoid creating deep-copies of the same Function * multiple times. If 'name' is specified, copy's name will be set to that. */ // @{ void deep_copy(const FunctionPtr ©, std::map &copied_map) const; void deep_copy(std::string name, const FunctionPtr ©, std::map &copied_map) const; // @} /** Add a pure definition to this function. It may not already * have a definition. All the free variables in 'value' must * appear in the args list. 'value' must not depend on any * reduction domain */ void define(const std::vector &args, std::vector values); /** Add an update definition to this function. It must already * have a pure definition but not an update definition, and the * length of args must match the length of args used in the pure * definition. 
'value' must depend on some reduction domain, and * may contain variables from that domain as well as pure * variables. Any pure variables must also appear as Variables in * the args array, and they must have the same name as the pure * definition's argument in the same index. */ void define_update(const std::vector &args, std::vector values); /** Accept a visitor to visit all of the definitions and arguments * of this function. */ void accept(IRVisitor *visitor) const; /** Accept a mutator to mutator all of the definitions and * arguments of this function. */ void mutate(IRMutator *mutator); /** Get the name of the function. */ const std::string &name() const; /** If this is a wrapper of another func, created by a chain of in * or clone_in calls, returns the name of the original * Func. Otherwise returns the name. */ const std::string &origin_name() const; /** Get a mutable handle to the init definition. */ Definition &definition(); /** Get the init definition. */ const Definition &definition() const; /** Get the pure arguments. */ const std::vector &args() const; /** Get the dimensionality. */ int dimensions() const; /** Get the number of outputs. */ int outputs() const { return (int)output_types().size(); } /** Get the types of the outputs. */ const std::vector &output_types() const; /** Get the right-hand-side of the pure definition. Returns an * empty vector if there is no pure definition. */ const std::vector &values() const; /** Does this function have a pure definition? */ bool has_pure_definition() const; /** Does this function *only* have a pure definition? */ bool is_pure() const { return (has_pure_definition() && !has_update_definition() && !has_extern_definition()); } /** Is it legal to inline this function? */ bool can_be_inlined() const; /** Get a handle to the function-specific schedule for the purpose * of modifying it. */ FuncSchedule &schedule(); /** Get a const handle to the function-specific schedule for inspecting it. */ const FuncSchedule &schedule() const; /** Get a handle on the output buffer used for setting constraints * on it. */ const std::vector &output_buffers() const; /** Get a mutable handle to the stage-specfic schedule for the update * stage. */ StageSchedule &update_schedule(int idx = 0); /** Get a mutable handle to this function's update definition at * index 'idx'. */ Definition &update(int idx = 0); /** Get a const reference to this function's update definition at * index 'idx'. */ const Definition &update(int idx = 0) const; /** Get a const reference to this function's update definitions. */ const std::vector &updates() const; /** Does this function have an update definition? */ bool has_update_definition() const; /** Check if the function has an extern definition. */ bool has_extern_definition() const; /** Get the name mangling specified for the extern definition. */ NameMangling extern_definition_name_mangling() const; /** Make a call node to the extern definition. An error if the * function has no extern definition. */ Expr make_call_to_extern_definition(const std::vector &args, const Target &t) const; /** Get the proxy Expr for the extern stage. This is an expression * known to have the same data access pattern as the extern * stage. It must touch at least all of the memory that the extern * stage does, though it is permissible for it to be conservative * and touch a superset. For most Functions, including those with * extern definitions, this will be an undefined Expr. 
*/ // @{ Expr extern_definition_proxy_expr() const; Expr &extern_definition_proxy_expr(); // @} /** Add an external definition of this Func. */ void define_extern(const std::string &function_name, const std::vector<ExternFuncArgument> &args, const std::vector<Type> &types, const std::vector<Var> &dims, NameMangling mangling, DeviceAPI device_api); /** Retrieve the arguments of the extern definition. */ // @{ const std::vector<ExternFuncArgument> &extern_arguments() const; std::vector<ExternFuncArgument> &extern_arguments(); // @} /** Get the name of the extern function called for an extern * definition. */ const std::string &extern_function_name() const; /** Get the DeviceAPI declared for an extern function. */ DeviceAPI extern_function_device_api() const; /** Test for equality of identity. */ bool same_as(const Function &other) const { return contents.same_as(other.contents); } /** Get a const handle to the debug filename. */ const std::string &debug_file() const; /** Get a handle to the debug filename. */ std::string &debug_file(); /** Use as an extern argument to another function. */ operator ExternFuncArgument() const; /** Tracing calls and accessors, passed down from the Func * equivalents. */ // @{ void trace_loads(); void trace_stores(); void trace_realizations(); void add_trace_tag(const std::string &trace_tag); bool is_tracing_loads() const; bool is_tracing_stores() const; bool is_tracing_realizations() const; const std::vector<std::string> &get_trace_tags() const; // @} /** Replace this Function's LoopLevels with locked copies that * cannot be mutated further. */ void lock_loop_levels(); /** Mark function as frozen, which means it cannot accept new * definitions. */ void freeze(); /** Check if a function has been frozen. If so, it is an error to * add new definitions. */ bool frozen() const; /** Make a new Function with the same lifetime as this one, and * return a strong reference to it. Useful to create Functions which * have circular references to this one - e.g. the wrappers * produced by Func::in. */ Function new_function_in_same_group(const std::string &); /** Mark calls of this function by 'f' to be replaced with its wrapper * during the lowering stage. If the string 'f' is empty, it means replace * all calls to this function by all other functions (excluding itself) in * the pipeline with the wrapper. This will also freeze 'wrapper' to prevent * the user from updating the values of the Function it wraps via the wrapper. * See \ref Func::in for more details. */ // @{ void add_wrapper(const std::string &f, Function &wrapper); const std::map<std::string, FunctionPtr> &wrappers() const; // @} /** Check if a Function is a trivial wrapper around another * Function, Buffer, or Parameter. Returns the Call node if it * is. Otherwise returns null. */ const Call *is_wrapper() const; /** Replace every call to Functions in the 'substitutions' keys, in all Exprs * referenced by this Function, with a call to their substitute Functions (i.e. * the corresponding values in the 'substitutions' map). */ // @{ Function &substitute_calls(const std::map<FunctionPtr, FunctionPtr> &substitutions); Function &substitute_calls(const Function &orig, const Function &substitute); // @} /** Return true iff the name matches one of the Function's pure args. */ bool is_pure_arg(const std::string &name) const; }; /** Deep copy an entire Function DAG. */ std::pair<std::vector<Function>, std::map<std::string, Function>> deep_copy( const std::vector<Function> &outputs, const std::map<std::string, Function> &env); } // namespace Internal } // namespace Halide #endif #ifndef HALIDE_MODULUS_REMAINDER_H #define HALIDE_MODULUS_REMAINDER_H /** \file * Routines for statically determining what expressions are divisible by.
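 *
 * A minimal sketch of the kind of fact this analysis derives (the Var x
 * is an assumed name):
 * \code
 * Var x;
 * Internal::ModulusRemainder mr = Internal::modulus_remainder(10 * x + 2);
 * // mr.modulus == 10, mr.remainder == 2
 * \endcode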
 */

#include <stdint.h>

namespace Halide {

struct Expr;

namespace Internal {

template<typename T>
class Scope;

/** The result of modulus_remainder analysis. These represent strided
 * subsets of the integers. A ModulusRemainder object m represents all
 * integers x such that there exists y such that x == m.modulus * y +
 * m.remainder. Note that under this definition a set containing a
 * single integer (a constant) is represented using a modulus of
 * zero. These sets can be combined with several mathematical
 * operators in the obvious way. E.g. m1 + m2 contains (at least) all
 * integers x1 + x2 such that x1 belongs to m1 and x2 belongs to
 * m2. These combinations are conservative. If some internal math
 * would overflow, it defaults to all of the integers (modulus == 1,
 * remainder == 0). */
struct ModulusRemainder {
    ModulusRemainder()
        : modulus(1), remainder(0) {
    }
    ModulusRemainder(int64_t m, int64_t r)
        : modulus(m), remainder(r) {
    }

    int64_t modulus, remainder;

    // Take a conservatively-large union of two sets. Contains all
    // elements from both sets, and maybe some more stuff.
    static ModulusRemainder unify(const ModulusRemainder &a, const ModulusRemainder &b);

    // Take a conservatively-large intersection. Everything in the
    // result is in at least one of the two sets, but not always both.
    static ModulusRemainder intersect(const ModulusRemainder &a, const ModulusRemainder &b);

    bool operator==(const ModulusRemainder &other) const {
        return (modulus == other.modulus) && (remainder == other.remainder);
    }
};

ModulusRemainder operator+(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator-(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator*(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator/(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator%(const ModulusRemainder &a, const ModulusRemainder &b);

ModulusRemainder operator+(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator-(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator*(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator/(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator%(const ModulusRemainder &a, int64_t b);

/** For things like alignment analysis, often it's helpful to know
 * if an integer expression is some multiple of a constant plus
 * some other constant. For example, it is straightforward to
 * deduce that ((10*x + 2)*(6*y - 3) - 1) is congruent to five
 * modulo six.
 *
 * We get the most information when the modulus is large. E.g. if
 * something is congruent to 208 modulo 384, then we also know it's
 * congruent to 0 mod 8, and we can possibly use it as an index for an
 * aligned load. If all else fails, we can just say that an integer is
 * congruent to zero modulo one. */
ModulusRemainder modulus_remainder(const Expr &e);

/** If we have alignment information about external variables, we can
 * let the analysis know about that using this version of
 * modulus_remainder: */
ModulusRemainder modulus_remainder(const Expr &e, const Scope<ModulusRemainder> &scope);

/** Reduce an expression modulo some integer. Returns true and assigns
 * to remainder if an answer could be found.
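 *
 * For example (illustrative; 'x' is assumed to be an integer Variable
 * defined elsewhere):
 * \code
 * Expr e = 4 * x + 6;
 * int64_t r;
 * if (reduce_expr_modulo(e, 4, &r)) {
 *     // r == 2: e is congruent to 2 modulo 4 for any integer x
 * }
 * \endcode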
*/ ///@{ bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder); bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope &scope); ///@} void modulus_remainder_test(); /** The greatest common divisor of two integers */ int64_t gcd(int64_t, int64_t); /** The least common multiple of two integers */ int64_t lcm(int64_t, int64_t); } // namespace Internal } // namespace Halide #endif namespace Halide { template class Buffer; /** Enums specifying various kinds of outputs that can be produced from a Halide Pipeline. */ enum class Output { assembly, bitcode, c_header, c_source, cpp_stub, featurization, llvm_assembly, object, python_extension, pytorch_wrapper, registration, schedule, static_library, stmt, stmt_html, }; /** Type of linkage a function in a lowered Halide module can have. Also controls whether auxiliary functions and metadata are generated. */ enum class LinkageType { External, ///< Visible externally. ExternalPlusMetadata, ///< Visible externally. Argument metadata and an argv wrapper are also generated. Internal, ///< Not visible externally, similar to 'static' linkage in C. }; namespace Internal { struct OutputInfo { std::string name, extension; }; std::map get_output_info(const Target &target); /** Definition of an argument to a LoweredFunc. This is similar to * Argument, except it enables passing extra information useful to * some targets to LoweredFunc. */ struct LoweredArgument : public Argument { /** For scalar arguments, the modulus and remainder of this * argument. */ ModulusRemainder alignment; LoweredArgument() = default; explicit LoweredArgument(const Argument &arg) : Argument(arg) { } LoweredArgument(const std::string &_name, Kind _kind, const Type &_type, uint8_t _dimensions, const ArgumentEstimates &argument_estimates) : Argument(_name, _kind, _type, _dimensions, argument_estimates) { } }; /** Definition of a lowered function. This object provides a concrete * mapping between parameters used in the function body and their * declarations in the argument list. */ struct LoweredFunc { std::string name; /** Arguments referred to in the body of this function. */ std::vector args; /** Body of this function. */ Stmt body; /** The linkage of this function. */ LinkageType linkage; /** The name-mangling choice for the function. Defaults to using * the Target. */ NameMangling name_mangling; LoweredFunc(const std::string &name, const std::vector &args, Stmt body, LinkageType linkage, NameMangling mangling = NameMangling::Default); LoweredFunc(const std::string &name, const std::vector &args, Stmt body, LinkageType linkage, NameMangling mangling = NameMangling::Default); }; } // namespace Internal namespace Internal { struct ModuleContents; } struct AutoSchedulerResults; /** A halide module. This represents IR containing lowered function * definitions and buffers. */ class Module { Internal::IntrusivePtr contents; public: Module(const std::string &name, const Target &target); /** Get the target this module has been lowered for. */ const Target &target() const; /** The name of this module. This is used as the default filename * for output operations. */ const std::string &name() const; /** If this Module had an auto-generated schedule, return a read-only pointer * to the AutoSchedulerResults. If not, return nullptr. */ const AutoSchedulerResults *get_auto_scheduler_results() const; /** Return whether this module uses strict floating-point anywhere. */ bool any_strict_float() const; /** The declarations contained in this module. 
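 *
 * For example (illustrative; 'm' is a Module obtained elsewhere, e.g. by
 * lowering a pipeline):
 * \code
 * for (const Internal::LoweredFunc &f : m.functions()) {
 *     // f.name, f.args and f.body describe one lowered function
 * }
 * \endcode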
*/ // @{ const std::vector> &buffers() const; const std::vector &functions() const; std::vector &functions(); const std::vector &submodules() const; const std::vector &external_code() const; // @} /** Return the function with the given name. If no such function * exists in this module, assert. */ Internal::LoweredFunc get_function_by_name(const std::string &name) const; /** Add a declaration to this module. */ // @{ void append(const Buffer &buffer); void append(const Internal::LoweredFunc &function); void append(const Module &module); void append(const ExternalCode &external_code); // @} /** Compile a halide Module to variety of outputs, depending on * the fields set in output_files. */ void compile(const std::map &output_files) const; /** Compile a halide Module to in-memory object code. Currently * only supports LLVM based compilation, but should be extended to * handle source code backends. */ Buffer compile_to_buffer() const; /** Return a new module with all submodules compiled to buffers on * on the result Module. */ Module resolve_submodules() const; /** When generating metadata from this module, remap any occurrences * of 'from' into 'to'. */ void remap_metadata_name(const std::string &from, const std::string &to) const; /** Retrieve the metadata name map. */ std::map get_metadata_name_map() const; /** Set the AutoSchedulerResults for the Module. It is an error to call this * multiple times for a given Module. */ void set_auto_scheduler_results(const AutoSchedulerResults &results); /** Set whether this module uses strict floating-point directives anywhere. */ void set_any_strict_float(bool any_strict_float); }; /** Link a set of modules together into one module. */ Module link_modules(const std::string &name, const std::vector &modules); /** Create an object file containing the Halide runtime for a given target. For * use with Target::NoRuntime. Standalone runtimes are only compatible with * pipelines compiled by the same build of Halide used to call this function. */ void compile_standalone_runtime(const std::string &object_filename, Target t); /** Create an object and/or static library file containing the Halide runtime * for a given target. For use with Target::NoRuntime. Standalone runtimes are * only compatible with pipelines compiled by the same build of Halide used to * call this function. Return a map with just the actual outputs filled in * (typically, Output::object and/or Output::static_library). */ std::map compile_standalone_runtime(const std::map &output_files, Target t); typedef std::function ModuleProducer; void compile_multitarget(const std::string &fn_name, const std::map &output_files, const std::vector &targets, const ModuleProducer &module_producer); } // namespace Halide #endif #ifndef HALIDE_PARAM_MAP_H #define HALIDE_PARAM_MAP_H /** \file * Defines a collection of parameters to be passed as formal arguments * to a JIT invocation. */ #include #ifndef HALIDE_PARAM_H #define HALIDE_PARAM_H #include #ifndef HALIDE_EXTERNFUNCARGUMENT_H #define HALIDE_EXTERNFUNCARGUMENT_H /** \file * Defines the internal representation of a halide ExternFuncArgument */ #ifndef HALIDE_BUFFER_H #define HALIDE_BUFFER_H #ifndef HALIDE_DEVICE_INTERFACE_H #define HALIDE_DEVICE_INTERFACE_H /** \file * Methods for managing device allocations when jitting */ namespace Halide { /** Gets the appropriate halide_device_interface_t * for a * DeviceAPI. If error_site is non-null, e.g. 
the name of the routine * calling get_device_interface_for_device_api, a user_error is * reported if the requested device API is not enabled in or supported * by the target, Halide has been compiled without this device API, or * the device API is None or Host or a bad value. The error_site * argument is printed in the error message. If error_site is null, * this routine returns nullptr instead of calling user_error. */ const halide_device_interface_t *get_device_interface_for_device_api(DeviceAPI d, const Target &t = get_jit_target_from_environment(), const char *error_site = nullptr); /** Get the specific DeviceAPI that Halide would select when presented * with DeviceAPI::Default_GPU for a given target. If no suitable api * is enabled in the target, returns DeviceAPI::Host. */ DeviceAPI get_default_device_api_for_target(const Target &t); /** This attempts to sniff whether a given Target (and its implied DeviceAPI) is usable on * the current host. If it appears to be usable, return true; if not, return false. * Note that a return value of true does *not* guarantee that future usage of * that device will succeed; it is intended mainly as a simple diagnostic * to allow early-exit when a desired device is definitely not usable. * Also note that this call is *NOT* threadsafe, as it temporarily redirect various * global error-handling hooks in Halide. */ bool host_supports_target_device(const Target &t); namespace Internal { /** Get an Expr which evaluates to the device interface for the given device api at runtime. */ Expr make_device_interface_call(DeviceAPI device_api); } // namespace Internal } // namespace Halide #endif /** \file * Defines a Buffer type that wraps from halide_buffer_t and adds * functionality, and methods for more conveniently iterating over the * samples in a halide_buffer_t outside of Halide code. */ #ifndef HALIDE_RUNTIME_BUFFER_H #define HALIDE_RUNTIME_BUFFER_H #include #include #include #include #include #include #include #include #if defined(__has_feature) #if __has_feature(memory_sanitizer) #include #endif #endif #ifdef _MSC_VER #define HALIDE_ALLOCA _alloca #else #define HALIDE_ALLOCA __builtin_alloca #endif // gcc 5.1 has a false positive warning on this code #if __GNUC__ == 5 && __GNUC_MINOR__ == 1 #pragma GCC diagnostic ignored "-Warray-bounds" #endif namespace Halide { namespace Runtime { // Forward-declare our Buffer class template class Buffer; // A helper to check if a parameter pack is entirely implicitly // int-convertible to use with std::enable_if template struct AllInts : std::false_type {}; template<> struct AllInts<> : std::true_type {}; template struct AllInts { static const bool value = std::is_convertible::value && AllInts::value; }; // Floats and doubles are technically implicitly int-convertible, but // doing so produces a warning we treat as an error, so just disallow // it here. template struct AllInts : std::false_type {}; template struct AllInts : std::false_type {}; // A helper to detect if there are any zeros in a container namespace Internal { template bool any_zero(const Container &c) { for (int i : c) { if (i == 0) return true; } return false; } } // namespace Internal /** A struct acting as a header for allocations owned by the Buffer * class itself. 
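 *
 * The header is placed at the start of the raw block returned by
 * Buffer::allocate(); buf.host is then rounded up past the header to the
 * alignment that allocate() uses, and decref() invokes the stored
 * deallocate_fn on the header's address once ref_count drops to zero.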
*/ struct AllocationHeader { void (*deallocate_fn)(void *); std::atomic ref_count; // Note that ref_count always starts at 1 AllocationHeader(void (*deallocate_fn)(void *)) : deallocate_fn(deallocate_fn), ref_count(1) { } }; /** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */ enum struct BufferDeviceOwnership : int { Allocated, ///> halide_device_free will be called when device ref count goes to zero WrappedNative, ///> halide_device_detach_native will be called when device ref count goes to zero Unmanaged, ///> No free routine will be called when device ref count goes to zero AllocatedDeviceAndHost, ///> Call device_and_host_free when DevRefCount goes to zero. Cropped, ///> Call halide_device_release_crop when DevRefCount goes to zero. }; /** A similar struct for managing device allocations. */ struct DeviceRefCount { // This is only ever constructed when there's something to manage, // so start at one. std::atomic count{1}; BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated}; }; /** A templated Buffer class that wraps halide_buffer_t and adds * functionality. When using Halide from C++, this is the preferred * way to create input and output buffers. The overhead of using this * class relative to a naked halide_buffer_t is minimal - it uses another * ~16 bytes on the stack, and does no dynamic allocations when using * it to represent existing memory of a known maximum dimensionality. * * The template parameter T is the element type. For buffers where the * element type is unknown, or may vary, use void or const void. * * D is the maximum number of dimensions that can be represented using * space inside the class itself. Set it to the maximum dimensionality * you expect this buffer to be. If the actual dimensionality exceeds * this, heap storage is allocated to track the shape of the buffer. D * defaults to 4, which should cover nearly all usage. * * The class optionally allocates and owns memory for the image using * a shared pointer allocated with the provided allocator. If they are * null, malloc and free are used. Any device-side allocation is * considered as owned if and only if the host-side allocation is * owned. */ template class Buffer { /** The underlying halide_buffer_t */ halide_buffer_t buf = {0}; /** Some in-class storage for shape of the dimensions. */ halide_dimension_t shape[D]; /** The allocation owned by this Buffer. NULL if the Buffer does not * own the memory. */ AllocationHeader *alloc = nullptr; /** A reference count for the device allocation owned by this * buffer. */ mutable DeviceRefCount *dev_ref_count = nullptr; /** True if T is of type void or const void */ static const bool T_is_void = std::is_same::type, void>::value; /** A type function that adds a const qualifier if T is a const type. */ template using add_const_if_T_is_const = typename std::conditional::value, const T2, T2>::type; /** T unless T is (const) void, in which case (const) * uint8_t. Useful for providing return types for operator() */ using not_void_T = typename std::conditional, T>::type; /** T with constness removed. Useful for return type of copy(). */ using not_const_T = typename std::remove_const::type; /** The type the elements are stored as. Equal to not_void_T * unless T is a pointer, in which case uint64_t. Halide stores * all pointer types as uint64s internally, even on 32-bit * systems. */ using storage_T = typename std::conditional::value, uint64_t, not_void_T>::type; public: /** True if the Halide type is not void (or const void). 
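 *
 * For example (illustrative):
 * \code
 * static_assert(Buffer<float>::has_static_halide_type, "");
 * static_assert(!Buffer<void>::has_static_halide_type, "");
 * \endcode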
*/ static constexpr bool has_static_halide_type = !T_is_void; /** Get the Halide type of T. Callers should not use the result if * has_static_halide_type is false. */ static halide_type_t static_halide_type() { return halide_type_of::type>(); } /** Does this Buffer own the host memory it refers to? */ bool owns_host_memory() const { return alloc != nullptr; } private: /** Increment the reference count of any owned allocation */ void incref() const { if (owns_host_memory()) { alloc->ref_count++; } if (buf.device) { if (!dev_ref_count) { // I seem to have a non-zero dev field but no // reference count for it. I must have been given a // device allocation by a Halide pipeline, and have // never been copied from since. Take sole ownership // of it. dev_ref_count = new DeviceRefCount; } dev_ref_count->count++; } } // Note that this is called "cropped" but can also encompass a slice/embed // operation as well. struct DevRefCountCropped : DeviceRefCount { Buffer cropped_from; DevRefCountCropped(const Buffer &cropped_from) : cropped_from(cropped_from) { ownership = BufferDeviceOwnership::Cropped; } }; /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */ void crop_from(const Buffer &cropped_from) { assert(dev_ref_count == nullptr); dev_ref_count = new DevRefCountCropped(cropped_from); } /** Decrement the reference count of any owned allocation and free host * and device memory if it hits zero. Sets alloc to nullptr. */ void decref() { if (owns_host_memory()) { int new_count = --(alloc->ref_count); if (new_count == 0) { void (*fn)(void *) = alloc->deallocate_fn; alloc->~AllocationHeader(); fn(alloc); } buf.host = nullptr; alloc = nullptr; set_host_dirty(false); } decref_dev(); } void decref_dev() { int new_count = 0; if (dev_ref_count) { new_count = --(dev_ref_count->count); } if (new_count == 0) { if (buf.device) { assert(!(alloc && device_dirty()) && "Implicitly freeing a dirty device allocation while a host allocation still lives. " "Call device_free explicitly if you want to drop dirty device-side data. " "Call copy_to_host explicitly if you want the data copied to the host allocation " "before the device allocation is freed."); if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) { buf.device_interface->detach_native(nullptr, &buf); } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) { buf.device_interface->device_and_host_free(nullptr, &buf); } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { buf.device_interface->device_release_crop(nullptr, &buf); } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) { buf.device_interface->device_free(nullptr, &buf); } } if (dev_ref_count) { if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { delete (DevRefCountCropped *)dev_ref_count; } else { delete dev_ref_count; } } } buf.device = 0; buf.device_interface = nullptr; dev_ref_count = nullptr; } void free_shape_storage() { if (buf.dim != shape) { delete[] buf.dim; buf.dim = nullptr; } } void make_shape_storage(const int dimensions) { // This should usually be inlined, so if dimensions is statically known, // we can skip the call to new buf.dimensions = dimensions; buf.dim = (dimensions <= D) ? shape : new halide_dimension_t[dimensions]; } void copy_shape_from(const halide_buffer_t &other) { // All callers of this ensure that buf.dimensions == other.dimensions. 
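        // make_shape_storage() reuses the in-class 'shape' array when
        // other.dimensions <= D, and only heap-allocates beyond that.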
make_shape_storage(other.dimensions); std::copy(other.dim, other.dim + other.dimensions, buf.dim); } template void move_shape_from(Buffer &&other) { if (other.shape == other.buf.dim) { copy_shape_from(other.buf); } else { buf.dim = other.buf.dim; other.buf.dim = nullptr; } } /** Initialize the shape from a halide_buffer_t. */ void initialize_from_buffer(const halide_buffer_t &b, BufferDeviceOwnership ownership) { memcpy(&buf, &b, sizeof(halide_buffer_t)); copy_shape_from(b); if (b.device) { dev_ref_count = new DeviceRefCount; dev_ref_count->ownership = ownership; } } /** Initialize the shape from an array of ints */ void initialize_shape(const int *sizes) { for (int i = 0; i < buf.dimensions; i++) { buf.dim[i].min = 0; buf.dim[i].extent = sizes[i]; if (i == 0) { buf.dim[i].stride = 1; } else { buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent; } } } /** Initialize the shape from a vector of extents */ void initialize_shape(const std::vector &sizes) { assert(buf.dimensions == (int)sizes.size()); initialize_shape(sizes.data()); } /** Initialize the shape from the static shape of an array */ template void initialize_shape_from_array_shape(int next, Array (&vals)[N]) { buf.dim[next].min = 0; buf.dim[next].extent = (int)N; if (next == 0) { buf.dim[next].stride = 1; } else { initialize_shape_from_array_shape(next - 1, vals[0]); buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent; } } /** Base case for the template recursion above. */ template void initialize_shape_from_array_shape(int, const T2 &) { } /** Get the dimensionality of a multi-dimensional C array */ template static int dimensionality_of_array(Array (&vals)[N]) { return dimensionality_of_array(vals[0]) + 1; } template static int dimensionality_of_array(const T2 &) { return 0; } /** Get the underlying halide_type_t of an array's element type. */ template static halide_type_t scalar_type_of_array(Array (&vals)[N]) { return scalar_type_of_array(vals[0]); } template static halide_type_t scalar_type_of_array(const T2 &) { return halide_type_of::type>(); } /** Crop a single dimension without handling device allocation. */ void crop_host(int d, int min, int extent) { assert(dim(d).min() <= min); assert(dim(d).max() >= min + extent - 1); int shift = min - dim(d).min(); if (buf.host != nullptr) { buf.host += shift * dim(d).stride() * type().bytes(); } buf.dim[d].min = min; buf.dim[d].extent = extent; } /** Crop as many dimensions as are in rect, without handling device allocation. */ void crop_host(const std::vector> &rect) { assert(rect.size() <= static_cast(std::numeric_limits::max())); int limit = (int)rect.size(); assert(limit <= dimensions()); for (int i = 0; i < limit; i++) { crop_host(i, rect[i].first, rect[i].second); } } void complete_device_crop(Buffer &result_host_cropped) const { assert(buf.device_interface != nullptr); if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == 0) { const Buffer *cropped_from = this; // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here? // is it possible to get to this point without incref having run at least once since // the device field was set? (I.e. in the internal logic of crop. incref might have been // called.) if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from; } result_host_cropped.crop_from(*cropped_from); } } /** slice a single dimension without handling device allocation. 
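     * For example, slicing dimension 1 of a 3-D buffer at position p yields a
     * 2-D view whose site (x, c) aliases the original site (x, p, c); the
     * higher dimensions shift down by one.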
*/ void slice_host(int d, int pos) { assert(d >= 0 && d < dimensions()); assert(pos >= dim(d).min() && pos <= dim(d).max()); buf.dimensions--; int shift = pos - buf.dim[d].min; if (buf.host != nullptr) { buf.host += shift * buf.dim[d].stride * type().bytes(); } for (int i = d; i < buf.dimensions; i++) { buf.dim[i] = buf.dim[i + 1]; } buf.dim[buf.dimensions] = {0, 0, 0}; } void complete_device_slice(Buffer &result_host_sliced, int d, int pos) const { assert(buf.device_interface != nullptr); if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == 0) { const Buffer *sliced_from = this; // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here? // is it possible to get to this point without incref having run at least once since // the device field was set? (I.e. in the internal logic of slice. incref might have been // called.) if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from; } // crop_from() is correct here, despite the fact that we are slicing. result_host_sliced.crop_from(*sliced_from); } } public: typedef T ElemType; /** Read-only access to the shape */ class Dimension { const halide_dimension_t &d; public: /** The lowest coordinate in this dimension */ HALIDE_ALWAYS_INLINE int min() const { return d.min; } /** The number of elements in memory you have to step over to * increment this coordinate by one. */ HALIDE_ALWAYS_INLINE int stride() const { return d.stride; } /** The extent of the image along this dimension */ HALIDE_ALWAYS_INLINE int extent() const { return d.extent; } /** The highest coordinate in this dimension */ HALIDE_ALWAYS_INLINE int max() const { return min() + extent() - 1; } /** An iterator class, so that you can iterate over * coordinates in a dimensions using a range-based for loop. */ struct iterator { int val; int operator*() const { return val; } bool operator!=(const iterator &other) const { return val != other.val; } iterator &operator++() { val++; return *this; } }; /** An iterator that points to the min coordinate */ HALIDE_ALWAYS_INLINE iterator begin() const { return {min()}; } /** An iterator that points to one past the max coordinate */ HALIDE_ALWAYS_INLINE iterator end() const { return {min() + extent()}; } Dimension(const halide_dimension_t &dim) : d(dim){}; }; /** Access the shape of the buffer */ HALIDE_ALWAYS_INLINE Dimension dim(int i) const { assert(i >= 0 && i < this->dimensions()); return Dimension(buf.dim[i]); } /** Access to the mins, strides, extents. Will be deprecated. Do not use. */ // @{ int min(int i) const { return dim(i).min(); } int extent(int i) const { return dim(i).extent(); } int stride(int i) const { return dim(i).stride(); } // @} /** The total number of elements this buffer represents. Equal to * the product of the extents */ size_t number_of_elements() const { size_t s = 1; for (int i = 0; i < dimensions(); i++) { s *= dim(i).extent(); } return s; } /** Get the dimensionality of the buffer. */ int dimensions() const { return buf.dimensions; } /** Get the type of the elements. */ halide_type_t type() const { return buf.type; } private: /** Offset to the element with the lowest address. If all * strides are positive, equal to zero. Offset is in elements, not bytes. 
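     *
     * For example, a buffer with extents {W, H} and strides {1, -W}
     * (dimension 1 stored in reverse) has begin_offset() == -W * (H - 1),
     * since only negatively-strided dimensions contribute
     * stride * (extent - 1).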
*/ ptrdiff_t begin_offset() const { ptrdiff_t index = 0; for (int i = 0; i < dimensions(); i++) { if (dim(i).stride() < 0) { index += dim(i).stride() * (dim(i).extent() - 1); } } return index; } /** An offset to one beyond the element with the highest address. * Offset is in elements, not bytes. */ ptrdiff_t end_offset() const { ptrdiff_t index = 0; for (int i = 0; i < dimensions(); i++) { if (dim(i).stride() > 0) { index += dim(i).stride() * (dim(i).extent() - 1); } } index += 1; return index; } public: /** A pointer to the element with the lowest address. If all * strides are positive, equal to the host pointer. */ T *begin() const { assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer. return (T *)(buf.host + begin_offset() * type().bytes()); } /** A pointer to one beyond the element with the highest address. */ T *end() const { assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer. return (T *)(buf.host + end_offset() * type().bytes()); } /** The total number of bytes spanned by the data in memory. */ size_t size_in_bytes() const { return (size_t)(end_offset() - begin_offset()) * type().bytes(); } /** Reset the Buffer to be equivalent to a default-constructed Buffer * of the same static type (if any); Buffer will have its runtime * type reset to uint8. */ void reset() { *this = Buffer(); } Buffer() : shape() { buf.type = static_halide_type(); make_shape_storage(0); } /** Make a Buffer from a halide_buffer_t */ explicit Buffer(const halide_buffer_t &buf, BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) { assert(T_is_void || buf.type == static_halide_type()); initialize_from_buffer(buf, ownership); } /** Give Buffers access to the members of Buffers of different dimensionalities and types. */ template friend class Buffer; private: template static void static_assert_can_convert_from() { static_assert((!std::is_const::value || std::is_const::value), "Can't convert from a Buffer to a Buffer"); static_assert(std::is_same::type, typename std::remove_const::type>::value || T_is_void || Buffer::T_is_void, "type mismatch constructing Buffer"); } public: /** Determine if if an Buffer can be constructed from some other Buffer type. * If this can be determined at compile time, fail with a static assert; otherwise * return a boolean based on runtime typing. */ template static bool can_convert_from(const Buffer &other) { static_assert_can_convert_from(); if (Buffer::T_is_void && !T_is_void) { return other.type() == static_halide_type(); } return true; } /** Fail an assertion at runtime or compile-time if an Buffer * cannot be constructed from some other Buffer type. */ template static void assert_can_convert_from(const Buffer &other) { // Explicitly call static_assert_can_convert_from() here so // that we always get compile-time checking, even if compiling with // assertions disabled. static_assert_can_convert_from(); assert(can_convert_from(other)); } /** Copy constructor. Does not copy underlying data. */ Buffer(const Buffer &other) : buf(other.buf), alloc(other.alloc) { other.incref(); dev_ref_count = other.dev_ref_count; copy_shape_from(other.buf); } /** Construct a Buffer from a Buffer of different dimensionality * and type. Asserts that the type matches (at runtime, if one of * the types is void). Note that this constructor is * implicit. This, for example, lets you pass things like * Buffer or Buffer to functions expected * Buffer. 
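     *
     * For example (illustrative):
     * \code
     * void consume(const Buffer<const void> &b);
     *
     * Buffer<uint8_t> img(640, 480);
     * consume(img);  // converts implicitly via this constructor
     * \endcode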
*/ template Buffer(const Buffer &other) : buf(other.buf), alloc(other.alloc) { assert_can_convert_from(other); other.incref(); dev_ref_count = other.dev_ref_count; copy_shape_from(other.buf); } /** Move constructor */ Buffer(Buffer &&other) noexcept : buf(other.buf), alloc(other.alloc), dev_ref_count(other.dev_ref_count) { other.dev_ref_count = nullptr; other.alloc = nullptr; move_shape_from(std::forward>(other)); other.buf = halide_buffer_t(); } /** Move-construct a Buffer from a Buffer of different * dimensionality and type. Asserts that the types match (at * runtime if one of the types is void). */ template Buffer(Buffer &&other) : buf(other.buf), alloc(other.alloc), dev_ref_count(other.dev_ref_count) { assert_can_convert_from(other); other.dev_ref_count = nullptr; other.alloc = nullptr; move_shape_from(std::forward>(other)); other.buf = halide_buffer_t(); } /** Assign from another Buffer of possibly-different * dimensionality and type. Asserts that the types match (at * runtime if one of the types is void). */ template Buffer &operator=(const Buffer &other) { if ((const void *)this == (const void *)&other) { return *this; } assert_can_convert_from(other); other.incref(); decref(); dev_ref_count = other.dev_ref_count; alloc = other.alloc; free_shape_storage(); buf = other.buf; copy_shape_from(other.buf); return *this; } /** Standard assignment operator */ Buffer &operator=(const Buffer &other) { if (this == &other) { return *this; } other.incref(); decref(); dev_ref_count = other.dev_ref_count; alloc = other.alloc; free_shape_storage(); buf = other.buf; copy_shape_from(other.buf); return *this; } /** Move from another Buffer of possibly-different * dimensionality and type. Asserts that the types match (at * runtime if one of the types is void). */ template Buffer &operator=(Buffer &&other) { assert_can_convert_from(other); decref(); alloc = other.alloc; other.alloc = nullptr; dev_ref_count = other.dev_ref_count; other.dev_ref_count = nullptr; free_shape_storage(); buf = other.buf; move_shape_from(std::forward>(other)); other.buf = halide_buffer_t(); return *this; } /** Standard move-assignment operator */ Buffer &operator=(Buffer &&other) noexcept { decref(); alloc = other.alloc; other.alloc = nullptr; dev_ref_count = other.dev_ref_count; other.dev_ref_count = nullptr; free_shape_storage(); buf = other.buf; move_shape_from(std::forward>(other)); other.buf = halide_buffer_t(); return *this; } /** Check the product of the extents fits in memory. */ void check_overflow() { size_t size = type().bytes(); for (int i = 0; i < dimensions(); i++) { size *= dim(i).extent(); } // We allow 2^31 or 2^63 bytes, so drop the top bit. size = (size << 1) >> 1; for (int i = 0; i < dimensions(); i++) { size /= dim(i).extent(); } assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer."); } /** Allocate memory for this Buffer. Drops the reference to any * owned memory. */ void allocate(void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr) { if (!allocate_fn) { allocate_fn = malloc; } if (!deallocate_fn) { deallocate_fn = free; } // Drop any existing allocation deallocate(); // Conservatively align images to 128 bytes. This is enough // alignment for all the platforms we might use. 
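        // We over-allocate by (alignment - 1) bytes plus space for the
        // AllocationHeader, construct the header at the start of the raw
        // block, and round buf.host up to the next multiple of 'alignment'.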
size_t size = size_in_bytes(); const size_t alignment = 128; size = (size + alignment - 1) & ~(alignment - 1); void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1); alloc = new (alloc_storage) AllocationHeader(deallocate_fn); uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader); buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1)); } /** Drop reference to any owned host or device memory, possibly * freeing it, if this buffer held the last reference to * it. Retains the shape of the buffer. Does nothing if this * buffer did not allocate its own memory. */ void deallocate() { decref(); } /** Drop reference to any owned device memory, possibly freeing it * if this buffer held the last reference to it. Asserts that * device_dirty is false. */ void device_deallocate() { decref_dev(); } /** Allocate a new image of the given size with a runtime * type. Only used when you do know what size you want but you * don't know statically what type the elements are. Pass zeroes * to make a buffer suitable for bounds query calls. */ template::value>::type> Buffer(halide_type_t t, int first, Args... rest) { if (!T_is_void) { assert(static_halide_type() == t); } int extents[] = {first, (int)rest...}; buf.type = t; constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); make_shape_storage(buf_dimensions); initialize_shape(extents); if (!Internal::any_zero(extents)) { check_overflow(); allocate(); } } /** Allocate a new image of the given size. Pass zeroes to make a * buffer suitable for bounds query calls. */ // @{ // The overload with one argument is 'explicit', so that // (say) int is not implicitly convertable to Buffer explicit Buffer(int first) { static_assert(!T_is_void, "To construct an Buffer, pass a halide_type_t as the first argument to the constructor"); int extents[] = {first}; buf.type = static_halide_type(); constexpr int buf_dimensions = 1; make_shape_storage(buf_dimensions); initialize_shape(extents); if (first != 0) { check_overflow(); allocate(); } } template::value>::type> Buffer(int first, int second, Args... rest) { static_assert(!T_is_void, "To construct an Buffer, pass a halide_type_t as the first argument to the constructor"); int extents[] = {first, second, (int)rest...}; buf.type = static_halide_type(); constexpr int buf_dimensions = 2 + (int)(sizeof...(rest)); make_shape_storage(buf_dimensions); initialize_shape(extents); if (!Internal::any_zero(extents)) { check_overflow(); allocate(); } } // @} /** Allocate a new image of unknown type using a vector of ints as the size. */ Buffer(halide_type_t t, const std::vector &sizes) { if (!T_is_void) { assert(static_halide_type() == t); } buf.type = t; make_shape_storage((int)sizes.size()); initialize_shape(sizes); if (!Internal::any_zero(sizes)) { check_overflow(); allocate(); } } /** Allocate a new image of known type using a vector of ints as the size. */ explicit Buffer(const std::vector &sizes) : Buffer(static_halide_type(), sizes) { } private: // Create a copy of the sizes vector, ordered as specified by order. static std::vector make_ordered_sizes(const std::vector &sizes, const std::vector &order) { assert(order.size() == sizes.size()); std::vector ordered_sizes(sizes.size()); for (size_t i = 0; i < sizes.size(); ++i) { ordered_sizes[i] = sizes.at(order[i]); } return ordered_sizes; } public: /** Allocate a new image of unknown type using a vector of ints as the size and * a vector of indices indicating the storage order for each dimension. 
The * length of the sizes vector and the storage-order vector must match. For instance, * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */ Buffer(halide_type_t t, const std::vector &sizes, const std::vector &storage_order) : Buffer(t, make_ordered_sizes(sizes, storage_order)) { transpose(storage_order); } Buffer(const std::vector &sizes, const std::vector &storage_order) : Buffer(static_halide_type(), sizes, storage_order) { } /** Make an Buffer that refers to a statically sized array. Does not * take ownership of the data, and does not set the host_dirty flag. */ template explicit Buffer(Array (&vals)[N]) { const int buf_dimensions = dimensionality_of_array(vals); buf.type = scalar_type_of_array(vals); buf.host = (uint8_t *)vals; make_shape_storage(buf_dimensions); initialize_shape_from_array_shape(buf.dimensions - 1, vals); } /** Initialize an Buffer of runtime type from a pointer and some * sizes. Assumes dense row-major packing and a min coordinate of * zero. Does not take ownership of the data and does not set the * host_dirty flag. */ template::value>::type> explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, int first, Args &&... rest) { if (!T_is_void) { assert(static_halide_type() == t); } int extents[] = {first, (int)rest...}; buf.type = t; constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); buf.host = (uint8_t *)const_cast(data); make_shape_storage(buf_dimensions); initialize_shape(extents); } /** Initialize an Buffer from a pointer and some sizes. Assumes * dense row-major packing and a min coordinate of zero. Does not * take ownership of the data and does not set the host_dirty flag. */ template::value>::type> explicit Buffer(T *data, int first, Args &&... rest) { int extents[] = {first, (int)rest...}; buf.type = static_halide_type(); constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); buf.host = (uint8_t *)const_cast::type *>(data); make_shape_storage(buf_dimensions); initialize_shape(extents); } /** Initialize an Buffer from a pointer and a vector of * sizes. Assumes dense row-major packing and a min coordinate of * zero. Does not take ownership of the data and does not set the * host_dirty flag. */ explicit Buffer(T *data, const std::vector &sizes) { buf.type = static_halide_type(); buf.host = (uint8_t *)const_cast::type *>(data); make_shape_storage((int)sizes.size()); initialize_shape(sizes); } /** Initialize an Buffer of runtime type from a pointer and a * vector of sizes. Assumes dense row-major packing and a min * coordinate of zero. Does not take ownership of the data and * does not set the host_dirty flag. */ explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, const std::vector &sizes) { if (!T_is_void) { assert(static_halide_type() == t); } buf.type = t; buf.host = (uint8_t *)const_cast(data); make_shape_storage((int)sizes.size()); initialize_shape(sizes); } /** Initialize an Buffer from a pointer to the min coordinate and * an array describing the shape. Does not take ownership of the * data, and does not set the host_dirty flag. */ explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, int d, const halide_dimension_t *shape) { if (!T_is_void) { assert(static_halide_type() == t); } buf.type = t; buf.host = (uint8_t *)const_cast(data); make_shape_storage(d); for (int i = 0; i < d; i++) { buf.dim[i] = shape[i]; } } /** Initialize a Buffer from a pointer to the min coordinate and * a vector describing the shape. 
Does not take ownership of the * data, and does not set the host_dirty flag. */ explicit inline Buffer(halide_type_t t, add_const_if_T_is_const *data, const std::vector &shape) : Buffer(t, data, (int)shape.size(), shape.data()) { } /** Initialize an Buffer from a pointer to the min coordinate and * an array describing the shape. Does not take ownership of the * data and does not set the host_dirty flag. */ explicit Buffer(T *data, int d, const halide_dimension_t *shape) { buf.type = static_halide_type(); buf.host = (uint8_t *)const_cast::type *>(data); make_shape_storage(d); for (int i = 0; i < d; i++) { buf.dim[i] = shape[i]; } } /** Initialize a Buffer from a pointer to the min coordinate and * a vector describing the shape. Does not take ownership of the * data, and does not set the host_dirty flag. */ explicit inline Buffer(T *data, const std::vector &shape) : Buffer(data, (int)shape.size(), shape.data()) { } /** Destructor. Will release any underlying owned allocation if * this is the last reference to it. Will assert fail if there are * weak references to this Buffer outstanding. */ ~Buffer() { free_shape_storage(); decref(); } /** Get a pointer to the raw halide_buffer_t this wraps. */ // @{ halide_buffer_t *raw_buffer() { return &buf; } const halide_buffer_t *raw_buffer() const { return &buf; } // @} /** Provide a cast operator to halide_buffer_t *, so that * instances can be passed directly to Halide filters. */ operator halide_buffer_t *() { return &buf; } /** Return a typed reference to this Buffer. Useful for converting * a reference to a Buffer to a reference to, for example, a * Buffer, or converting a Buffer& to Buffer&. * Does a runtime assert if the source buffer type is void. */ template::type> HALIDE_ALWAYS_INLINE Buffer & as() & { Buffer::assert_can_convert_from(*this); return *((Buffer *)this); } /** Return a const typed reference to this Buffer. Useful for * converting a conference reference to one Buffer type to a const * reference to another Buffer type. Does a runtime assert if the * source buffer type is void. */ template::type> HALIDE_ALWAYS_INLINE const Buffer &as() const & { Buffer::assert_can_convert_from(*this); return *((const Buffer *)this); } /** Returns this rval Buffer with a different type attached. Does * a dynamic type check if the source type is void. */ template HALIDE_ALWAYS_INLINE Buffer as() && { Buffer::assert_can_convert_from(*this); return *((Buffer *)this); } /** as_const() is syntactic sugar for .as(), to avoid the need * to recapitulate the type argument. */ // @{ HALIDE_ALWAYS_INLINE Buffer::type, D> &as_const() & { // Note that we can skip the assert_can_convert_from(), since T -> const T // conversion is always legal. return *((Buffer::type> *)this); } HALIDE_ALWAYS_INLINE const Buffer::type, D> &as_const() const & { return *((const Buffer::type> *)this); } HALIDE_ALWAYS_INLINE Buffer::type, D> as_const() && { return *((Buffer::type> *)this); } // @} /** Conventional names for the first three dimensions. */ // @{ int width() const { return (dimensions() > 0) ? dim(0).extent() : 1; } int height() const { return (dimensions() > 1) ? dim(1).extent() : 1; } int channels() const { return (dimensions() > 2) ? dim(2).extent() : 1; } // @} /** Conventional names for the min and max value of each dimension */ // @{ int left() const { return dim(0).min(); } int right() const { return dim(0).max(); } int top() const { return dim(1).min(); } int bottom() const { return dim(1).max(); } // @} /** Make a new image which is a deep copy of this image. 
Use crop * or slice followed by copy to make a copy of only a portion of * the image. The new image uses the same memory layout as the * original, with holes compacted away. Note that the returned * Buffer is always of a non-const type T (ie: * * Buffer.copy() -> Buffer rather than Buffer * * which is always safe, since we are making a deep copy. (The caller * can easily cast it back to Buffer if desired, which is * always safe and free.) */ Buffer copy(void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr) const { Buffer dst = Buffer::make_with_shape_of(*this, allocate_fn, deallocate_fn); dst.copy_from(*this); return dst; } /** Like copy(), but the copy is created in interleaved memory layout * (vs. keeping the same memory layout as the original). Requires that 'this' * has exactly 3 dimensions. */ Buffer copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr) const { assert(dimensions() == 3); Buffer dst = Buffer::make_interleaved(nullptr, width(), height(), channels()); dst.set_min(min(0), min(1), min(2)); dst.allocate(allocate_fn, deallocate_fn); dst.copy_from(*this); return dst; } /** Like copy(), but the copy is created in planar memory layout * (vs. keeping the same memory layout as the original). */ Buffer copy_to_planar(void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr) const { std::vector mins, extents; const int dims = dimensions(); mins.reserve(dims); extents.reserve(dims); for (int d = 0; d < dims; ++d) { mins.push_back(dim(d).min()); extents.push_back(dim(d).extent()); } Buffer dst = Buffer(nullptr, extents); dst.set_min(mins); dst.allocate(allocate_fn, deallocate_fn); dst.copy_from(*this); return dst; } /** Make a copy of the Buffer which shares the underlying host and/or device * allocations as the existing Buffer. This is purely syntactic sugar for * cases where you have a const reference to a Buffer but need a temporary * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse * inline way to create a temporary. \code * void call_my_func(const Buffer& input) { * my_func(input.alias(), output); * }\endcode */ inline Buffer alias() const { return *this; } /** Fill a Buffer with the values at the same coordinates in * another Buffer. Restricts itself to coordinates contained * within the intersection of the two buffers. If the two Buffers * are not in the same coordinate system, you will need to * translate the argument Buffer first. E.g. if you're blitting a * sprite onto a framebuffer, you'll want to translate the sprite * to the correct location first like so: \code * framebuffer.copy_from(sprite.translated({x, y})); \endcode */ template void copy_from(const Buffer &other) { static_assert(!std::is_const::value, "Cannot call copy_from() on a Buffer"); assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination."); assert(!other.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source."); Buffer src(other); Buffer dst(*this); assert(src.dimensions() == dst.dimensions()); // Trim the copy to the region in common for (int i = 0; i < dimensions(); i++) { int min_coord = std::max(dst.dim(i).min(), src.dim(i).min()); int max_coord = std::min(dst.dim(i).max(), src.dim(i).max()); if (max_coord < min_coord) { // The buffers do not overlap. 
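                // Nothing to copy: the region in common is empty, so leave
                // the destination untouched (set_host_dirty() below is skipped).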
return; } dst.crop(i, min_coord, max_coord - min_coord + 1); src.crop(i, min_coord, max_coord - min_coord + 1); } // If T is void, we need to do runtime dispatch to an // appropriately-typed lambda. We're copying, so we only care // about the element size. (If not, this should optimize away // into a static dispatch to the right-sized copy.) if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) { using MemType = uint8_t; auto &typed_dst = (Buffer &)dst; auto &typed_src = (Buffer &)src; typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src); } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) { using MemType = uint16_t; auto &typed_dst = (Buffer &)dst; auto &typed_src = (Buffer &)src; typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src); } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) { using MemType = uint32_t; auto &typed_dst = (Buffer &)dst; auto &typed_src = (Buffer &)src; typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src); } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) { using MemType = uint64_t; auto &typed_dst = (Buffer &)dst; auto &typed_src = (Buffer &)src; typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src); } else { assert(false && "type().bytes() must be 1, 2, 4, or 8"); } set_host_dirty(); } /** Make an image that refers to a sub-range of this image along * the given dimension. Asserts that the crop region is within * the existing bounds: you cannot "crop outwards", even if you know there * is valid Buffer storage (e.g. because you already cropped inwards). */ Buffer cropped(int d, int min, int extent) const { // Make a fresh copy of the underlying buffer (but not a fresh // copy of the allocation, if there is one). Buffer im = *this; // This guarantees the prexisting device ref is dropped if the // device_crop call fails and maintains the buffer in a consistent // state. im.device_deallocate(); im.crop_host(d, min, extent); if (buf.device_interface != nullptr) { complete_device_crop(im); } return im; } /** Crop an image in-place along the given dimension. This does * not move any data around in memory - it just changes the min * and extent of the given dimension. */ void crop(int d, int min, int extent) { // An optimization for non-device buffers. For the device case, // a temp buffer is required, so reuse the not-in-place version. // TODO(zalman|abadams): Are nop crops common enough to special // case the device part of the if to do nothing? if (buf.device_interface != nullptr) { *this = cropped(d, min, extent); } else { crop_host(d, min, extent); } } /** Make an image that refers to a sub-rectangle of this image along * the first N dimensions. Asserts that the crop region is within * the existing bounds. The cropped image may drop any device handle * if the device_interface cannot accomplish the crop in-place. */ Buffer cropped(const std::vector> &rect) const { // Make a fresh copy of the underlying buffer (but not a fresh // copy of the allocation, if there is one). Buffer im = *this; // This guarantees the prexisting device ref is dropped if the // device_crop call fails and maintains the buffer in a consistent // state. im.device_deallocate(); im.crop_host(rect); if (buf.device_interface != nullptr) { complete_device_crop(im); } return im; } /** Crop an image in-place along the first N dimensions. 
This does * not move any data around in memory, nor does it free memory. It * just rewrites the min/extent of each dimension to refer to a * subregion of the same allocation. */ void crop(const std::vector> &rect) { // An optimization for non-device buffers. For the device case, // a temp buffer is required, so reuse the not-in-place version. // TODO(zalman|abadams): Are nop crops common enough to special // case the device part of the if to do nothing? if (buf.device_interface != nullptr) { *this = cropped(rect); } else { crop_host(rect); } } /** Make an image which refers to the same data with using * translated coordinates in the given dimension. Positive values * move the image data to the right or down relative to the * coordinate system. Drops any device handle. */ Buffer translated(int d, int dx) const { Buffer im = *this; im.translate(d, dx); return im; } /** Translate an image in-place along one dimension by changing * how it is indexed. Does not move any data around in memory. */ void translate(int d, int delta) { assert(d >= 0 && d < this->dimensions()); device_deallocate(); buf.dim[d].min += delta; } /** Make an image which refers to the same data translated along * the first N dimensions. */ Buffer translated(const std::vector &delta) const { Buffer im = *this; im.translate(delta); return im; } /** Translate an image along the first N dimensions by changing * how it is indexed. Does not move any data around in memory. */ void translate(const std::vector &delta) { device_deallocate(); assert(delta.size() <= static_cast(std::numeric_limits::max())); int limit = (int)delta.size(); assert(limit <= dimensions()); for (int i = 0; i < limit; i++) { translate(i, delta[i]); } } /** Set the min coordinate of an image in the first N dimensions. */ // @{ void set_min(const std::vector &mins) { assert(mins.size() <= static_cast(dimensions())); device_deallocate(); for (size_t i = 0; i < mins.size(); i++) { buf.dim[i].min = mins[i]; } } template void set_min(Args... args) { set_min(std::vector{args...}); } // @} /** Test if a given coordinate is within the bounds of an image. */ // @{ bool contains(const std::vector &coords) const { assert(coords.size() <= static_cast(dimensions())); for (size_t i = 0; i < coords.size(); i++) { if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) { return false; } } return true; } template bool contains(Args... args) const { return contains(std::vector{args...}); } // @} /** Make a buffer which refers to the same data in the same layout * using a swapped indexing order for the dimensions given. So * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more * strongly that A.address_of(i, j) == B.address_of(j, i). */ Buffer transposed(int d1, int d2) const { Buffer im = *this; im.transpose(d1, d2); return im; } /** Transpose a buffer in-place by changing how it is indexed. For * example, transpose(0, 1) on a two-dimensional buffer means that * the value referred to by coordinates (i, j) is now reached at * the coordinates (j, i), and vice versa. This is done by * reordering the per-dimension metadata rather than by moving * data around in memory, so other views of the same memory will * not see the data as having been transposed. */ void transpose(int d1, int d2) { assert(d1 >= 0 && d1 < this->dimensions()); assert(d2 >= 0 && d2 < this->dimensions()); std::swap(buf.dim[d1], buf.dim[d2]); } /** A generalized transpose: instead of swapping two dimensions, * pass a vector that lists each dimension index exactly once, in * the desired order. 
This does not move any data around in memory * - it just permutes how it is indexed. */ void transpose(const std::vector &order) { assert((int)order.size() == dimensions()); if (dimensions() < 2) { // My, that was easy return; } std::vector order_sorted = order; for (size_t i = 1; i < order_sorted.size(); i++) { for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) { std::swap(order_sorted[j], order_sorted[j - 1]); transpose(j, j - 1); } } } /** Make a buffer which refers to the same data in the same * layout using a different ordering of the dimensions. */ Buffer transposed(const std::vector &order) const { Buffer im = *this; im.transpose(order); return im; } /** Make a lower-dimensional buffer that refers to one slice of * this buffer. */ Buffer sliced(int d, int pos) const { Buffer im = *this; // This guarantees the prexisting device ref is dropped if the // device_slice call fails and maintains the buffer in a consistent // state. im.device_deallocate(); im.slice_host(d, pos); if (buf.device_interface != nullptr) { complete_device_slice(im, d, pos); } return im; } /** Make a lower-dimensional buffer that refers to one slice of this * buffer at the dimension's minimum. */ inline Buffer sliced(int d) const { return sliced(d, dim(d).min()); } /** Rewrite the buffer to refer to a single lower-dimensional * slice of itself along the given dimension at the given * coordinate. Does not move any data around or free the original * memory, so other views of the same data are unaffected. */ void slice(int d, int pos) { // An optimization for non-device buffers. For the device case, // a temp buffer is required, so reuse the not-in-place version. // TODO(zalman|abadams): Are nop slices common enough to special // case the device part of the if to do nothing? if (buf.device_interface != nullptr) { *this = sliced(d, pos); } else { slice_host(d, pos); } } /** Slice a buffer in-place at the dimension's minimum. */ inline void slice(int d) { slice(d, dim(d).min()); } /** Make a new buffer that views this buffer as a single slice in a * higher-dimensional space. The new dimension has extent one and * the given min. This operation is the opposite of slice. As an * example, the following condition is true: * \code im2 = im.embedded(1, 17); &im(x, y, c) == &im2(x, 17, y, c); \endcode */ Buffer embedded(int d, int pos = 0) const { Buffer im(*this); im.embed(d, pos); return im; } /** Embed a buffer in-place, increasing the * dimensionality. */ void embed(int d, int pos = 0) { assert(d >= 0 && d <= dimensions()); add_dimension(); translate(dimensions() - 1, pos); for (int i = dimensions() - 1; i > d; i--) { transpose(i, i - 1); } } /** Add a new dimension with a min of zero and an extent of * one. The stride is the extent of the outermost dimension times * its stride. The new dimension is the last dimension. This is a * special case of embed. */ void add_dimension() { const int dims = buf.dimensions; buf.dimensions++; if (buf.dim != shape) { // We're already on the heap. Reallocate. 
halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions]; for (int i = 0; i < dims; i++) { new_shape[i] = buf.dim[i]; } delete[] buf.dim; buf.dim = new_shape; } else if (dims == D) { // Transition from the in-class storage to the heap make_shape_storage(buf.dimensions); for (int i = 0; i < dims; i++) { buf.dim[i] = shape[i]; } } else { // We still fit in the class } buf.dim[dims] = {0, 1, 0}; if (dims == 0) { buf.dim[dims].stride = 1; } else { buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride; } } /** Add a new dimension with a min of zero, an extent of one, and * the specified stride. The new dimension is the last * dimension. This is a special case of embed. */ void add_dimension_with_stride(int s) { add_dimension(); buf.dim[buf.dimensions - 1].stride = s; } /** Methods for managing any GPU allocation. */ // @{ // Set the host dirty flag. Called by every operator() // access. Must be inlined so it can be hoisted out of loops. HALIDE_ALWAYS_INLINE void set_host_dirty(bool v = true) { assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host."); buf.set_host_dirty(v); } // Check if the device allocation is dirty. Called by // set_host_dirty, which is called by every accessor. Must be // inlined so it can be hoisted out of loops. HALIDE_ALWAYS_INLINE bool device_dirty() const { return buf.device_dirty(); } bool host_dirty() const { return buf.host_dirty(); } void set_device_dirty(bool v = true) { assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty."); buf.set_device_dirty(v); } int copy_to_host(void *ctx = nullptr) { if (device_dirty()) { return buf.device_interface->copy_to_host(ctx, &buf); } return 0; } int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { if (host_dirty()) { return device_interface->copy_to_device(ctx, &buf, device_interface); } return 0; } int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { return device_interface->device_malloc(ctx, &buf, device_interface); } int device_free(void *ctx = nullptr) { if (dev_ref_count) { assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated && "Can't call device_free on an unmanaged or wrapped native device handle. " "Free the source allocation or call device_detach_native instead."); // Multiple people may be holding onto this dev field assert(dev_ref_count->count == 1 && "Multiple Halide::Runtime::Buffer objects share this device " "allocation. Freeing it would create dangling references. " "Don't call device_free on Halide buffers that you have copied or " "passed by value."); } int ret = 0; if (buf.device_interface) { ret = buf.device_interface->device_free(ctx, &buf); } if (dev_ref_count) { delete dev_ref_count; dev_ref_count = nullptr; } return ret; } int device_wrap_native(const struct halide_device_interface_t *device_interface, uint64_t handle, void *ctx = nullptr) { assert(device_interface); dev_ref_count = new DeviceRefCount; dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative; return device_interface->wrap_native(ctx, &buf, handle, device_interface); } int device_detach_native(void *ctx = nullptr) { assert(dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative && "Only call device_detach_native on buffers wrapping a native " "device handle via device_wrap_native. This buffer was allocated " "using device_malloc, or is unmanaged. 
" "Call device_free or free the original allocation instead."); // Multiple people may be holding onto this dev field assert(dev_ref_count->count == 1 && "Multiple Halide::Runtime::Buffer objects share this device " "allocation. Freeing it could create dangling references. " "Don't call device_detach_native on Halide buffers that you " "have copied or passed by value."); int ret = 0; if (buf.device_interface) { ret = buf.device_interface->detach_native(ctx, &buf); } delete dev_ref_count; dev_ref_count = nullptr; return ret; } int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { return device_interface->device_and_host_malloc(ctx, &buf, device_interface); } int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { if (dev_ref_count) { assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost && "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. " "Free the source allocation or call device_detach_native instead."); // Multiple people may be holding onto this dev field assert(dev_ref_count->count == 1 && "Multiple Halide::Runtime::Buffer objects share this device " "allocation. Freeing it would create dangling references. " "Don't call device_and_host_free on Halide buffers that you have copied or " "passed by value."); } int ret = 0; if (buf.device_interface) { ret = buf.device_interface->device_and_host_free(ctx, &buf); } if (dev_ref_count) { delete dev_ref_count; dev_ref_count = nullptr; } return ret; } int device_sync(void *ctx = nullptr) { if (buf.device_interface) { return buf.device_interface->device_sync(ctx, &buf); } else { return 0; } } bool has_device_allocation() const { return buf.device != 0; } /** Return the method by which the device field is managed. */ BufferDeviceOwnership device_ownership() const { if (dev_ref_count == nullptr) { return BufferDeviceOwnership::Allocated; } return dev_ref_count->ownership; } // @} /** If you use the (x, y, c) indexing convention, then Halide * Buffers are stored planar by default. This function constructs * an interleaved RGB or RGBA image that can still be indexed * using (x, y, c). Passing it to a generator requires that the * generator has been compiled with support for interleaved (also * known as packed or chunky) memory layouts. */ static Buffer make_interleaved(halide_type_t t, int width, int height, int channels) { Buffer im(t, channels, width, height); // Note that this is equivalent to calling transpose({2, 0, 1}), // but slightly more efficient. im.transpose(0, 1); im.transpose(1, 2); return im; } /** If you use the (x, y, c) indexing convention, then Halide * Buffers are stored planar by default. This function constructs * an interleaved RGB or RGBA image that can still be indexed * using (x, y, c). Passing it to a generator requires that the * generator has been compiled with support for interleaved (also * known as packed or chunky) memory layouts. */ static Buffer make_interleaved(int width, int height, int channels) { return make_interleaved(static_halide_type(), width, height, channels); } /** Wrap an existing interleaved image. */ static Buffer, D> make_interleaved(halide_type_t t, T *data, int width, int height, int channels) { Buffer, D> im(t, data, channels, width, height); im.transpose(0, 1); im.transpose(1, 2); return im; } /** Wrap an existing interleaved image. 
*/ static Buffer make_interleaved(T *data, int width, int height, int channels) { return make_interleaved(static_halide_type(), data, width, height, channels); } /** Make a zero-dimensional Buffer */ static Buffer, D> make_scalar(halide_type_t t) { Buffer, 1> buf(t, 1); buf.slice(0, 0); return buf; } /** Make a zero-dimensional Buffer */ static Buffer make_scalar() { Buffer buf(1); buf.slice(0, 0); return buf; } /** Make a zero-dimensional Buffer that points to non-owned, existing data */ static Buffer make_scalar(T *data) { Buffer buf(data, 1); buf.slice(0, 0); return buf; } /** Make a buffer with the same shape and memory nesting order as * another buffer. It may have a different type. */ template static Buffer make_with_shape_of(Buffer src, void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr) { const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of::type>(); return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim, allocate_fn, deallocate_fn); } private: static Buffer<> make_with_shape_of_helper(halide_type_t dst_type, int dimensions, halide_dimension_t *shape, void *(*allocate_fn)(size_t), void (*deallocate_fn)(void *)) { // Reorder the dimensions of src to have strides in increasing order std::vector swaps; for (int i = dimensions - 1; i > 0; i--) { for (int j = i; j > 0; j--) { if (shape[j - 1].stride > shape[j].stride) { std::swap(shape[j - 1], shape[j]); swaps.push_back(j); } } } // Rewrite the strides to be dense (this messes up src, which // is why we took it by value). for (int i = 0; i < dimensions; i++) { if (i == 0) { shape[i].stride = 1; } else { shape[i].stride = shape[i - 1].extent * shape[i - 1].stride; } } // Undo the dimension reordering while (!swaps.empty()) { int j = swaps.back(); std::swap(shape[j - 1], shape[j]); swaps.pop_back(); } // Use an explicit runtime type, and make dst a Buffer, to allow // using this method with Buffer for either src or dst. Buffer<> dst(dst_type, nullptr, dimensions, shape); dst.allocate(allocate_fn, deallocate_fn); return dst; } template HALIDE_ALWAYS_INLINE ptrdiff_t offset_of(int d, int first, Args... rest) const { return offset_of(d + 1, rest...) + this->buf.dim[d].stride * (first - this->buf.dim[d].min); } HALIDE_ALWAYS_INLINE ptrdiff_t offset_of(int d) const { return 0; } template HALIDE_ALWAYS_INLINE storage_T * address_of(Args... args) const { if (T_is_void) { return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes(); } else { return (storage_T *)(this->buf.host) + offset_of(0, args...); } } HALIDE_ALWAYS_INLINE ptrdiff_t offset_of(const int *pos) const { ptrdiff_t offset = 0; for (int i = this->dimensions() - 1; i >= 0; i--) { offset += this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min); } return offset; } HALIDE_ALWAYS_INLINE storage_T *address_of(const int *pos) const { if (T_is_void) { return (storage_T *)this->buf.host + offset_of(pos) * type().bytes(); } else { return (storage_T *)this->buf.host + offset_of(pos); } } public: /** Get a pointer to the address of the min coordinate. */ T *data() const { return (T *)(this->buf.host); } /** Access elements. Use im(...) to get a reference to an element, * and use &im(...) to get the address of an element. If you pass * fewer arguments than the buffer has dimensions, the rest are * treated as their min coordinate. The non-const versions set the * host_dirty flag to true. */ //@{ template::value>::type> HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... 
rest) const { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); assert(!device_dirty()); return *((const not_void_T *)(address_of(first, rest...))); } HALIDE_ALWAYS_INLINE const not_void_T & operator()() const { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); assert(!device_dirty()); return *((const not_void_T *)(data())); } HALIDE_ALWAYS_INLINE const not_void_T & operator()(const int *pos) const { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); assert(!device_dirty()); return *((const not_void_T *)(address_of(pos))); } template::value>::type> HALIDE_ALWAYS_INLINE not_void_T & operator()(int first, Args... rest) { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); set_host_dirty(); return *((not_void_T *)(address_of(first, rest...))); } HALIDE_ALWAYS_INLINE not_void_T & operator()() { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); set_host_dirty(); return *((not_void_T *)(data())); } HALIDE_ALWAYS_INLINE not_void_T & operator()(const int *pos) { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); set_host_dirty(); return *((not_void_T *)(address_of(pos))); } // @} /** Tests that all values in this buffer are equal to val. */ bool all_equal(not_void_T val) const { bool all_equal = true; for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; }); return all_equal; } Buffer &fill(not_void_T val) { set_host_dirty(); for_each_value([=](T &v) { v = val; }); return *this; } private: /** Helper functions for for_each_value. */ // @{ template struct for_each_value_task_dim { int extent; int stride[N]; }; // Given an array of strides, and a bunch of pointers to pointers // (all of different types), advance the pointers using the // strides. template HALIDE_ALWAYS_INLINE static void advance_ptrs(const int *stride, Ptr *ptr, Ptrs... ptrs) { (*ptr) += *stride; advance_ptrs(stride + 1, ptrs...); } HALIDE_ALWAYS_INLINE static void advance_ptrs(const int *) { } // Same as the above, but just increments the pointers. template HALIDE_ALWAYS_INLINE static void increment_ptrs(Ptr *ptr, Ptrs... ptrs) { (*ptr)++; increment_ptrs(ptrs...); } HALIDE_ALWAYS_INLINE static void increment_ptrs() { } template HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one, const for_each_value_task_dim *t, Ptrs... ptrs) { if (d == -1) { f((*ptrs)...); } else if (d == 0) { if (innermost_strides_are_one) { for (int i = t[0].extent; i != 0; i--) { f((*ptrs)...); increment_ptrs((&ptrs)...); } } else { for (int i = t[0].extent; i != 0; i--) { f((*ptrs)...); advance_ptrs(t[0].stride, (&ptrs)...); } } } else { for (int i = t[d].extent; i != 0; i--) { for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptrs...); advance_ptrs(t[d].stride, (&ptrs)...); } } } template HALIDE_NEVER_INLINE static bool for_each_value_prep(for_each_value_task_dim *t, const halide_buffer_t **buffers) { const int dimensions = buffers[0]->dimensions; // Extract the strides in all the dimensions for (int i = 0; i < dimensions; i++) { for (int j = 0; j < N; j++) { assert(buffers[j]->dimensions == dimensions); assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent && buffers[j]->dim[i].min == buffers[0]->dim[i].min); const int s = buffers[j]->dim[i].stride; t[i].stride[j] = s; } t[i].extent = buffers[0]->dim[i].extent; // Order the dimensions by stride, so that the traversal is cache-coherent. 
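            // (Descriptive note, not in the original source: the loop below
            // is an insertion sort keyed on the first buffer's stride; each
            // swap moves a whole task_dim entry, so the extent and all
            // per-buffer strides travel together.)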
for (int j = i; j > 0 && t[j].stride[0] < t[j - 1].stride[0]; j--) { std::swap(t[j], t[j - 1]); } } // flatten dimensions where possible to make a larger inner // loop for autovectorization. int d = dimensions; for (int i = 1; i < d; i++) { bool flat = true; for (int j = 0; j < N; j++) { flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j]; } if (flat) { t[i - 1].extent *= t[i].extent; for (int j = i; j < d; j++) { t[j] = t[j + 1]; } i--; d--; t[d].extent = 1; } } bool innermost_strides_are_one = true; if (dimensions > 0) { for (int i = 0; i < N; i++) { innermost_strides_are_one &= (t[0].stride[i] == 1); } } return innermost_strides_are_one; } template void for_each_value_impl(Fn &&f, Args &&... other_buffers) const { Buffer<>::for_each_value_task_dim *t = (Buffer<>::for_each_value_task_dim *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim)); // Move the preparatory code into a non-templated helper to // save code size. const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...}; bool innermost_strides_are_one = Buffer<>::for_each_value_prep(t, buffers); Buffer<>::for_each_value_helper(f, dimensions() - 1, innermost_strides_are_one, t, data(), (other_buffers.data())...); } // @} public: /** Call a function on every value in the buffer, and the * corresponding values in some number of other buffers of the * same size. The function should take a reference, const * reference, or value of the correct type for each buffer. This * effectively lifts a function of scalars to an element-wise * function of buffers. This produces code that the compiler can * autovectorize. This is slightly cheaper than for_each_element, * because it does not need to track the coordinates. * * Note that constness of Buffers is preserved: a const Buffer (for either * 'this' or the other-buffers arguments) will allow mutation of the * buffer contents, while a Buffer will not. Attempting to specify * a mutable reference for the lambda argument of a Buffer * will result in a compilation error. */ // @{ template HALIDE_ALWAYS_INLINE const Buffer &for_each_value(Fn &&f, Args &&... other_buffers) const { for_each_value_impl(f, std::forward(other_buffers)...); return *this; } template HALIDE_ALWAYS_INLINE Buffer & for_each_value(Fn &&f, Args &&... other_buffers) { for_each_value_impl(f, std::forward(other_buffers)...); return *this; } // @} private: // Helper functions for for_each_element struct for_each_element_task_dim { int min, max; }; /** If f is callable with this many args, call it. The first * argument is just to make the overloads distinct. Actual * overload selection is done using the enable_if. */ template()(std::declval()...))> HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) { f(args...); } /** If the above overload is impossible, we add an outer loop over * an additional argument and try again. */ template HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) { for (int i = t[d].min; i <= t[d].max; i++) { for_each_element_variadic(0, d - 1, t, std::forward(f), i, args...); } } /** Determine the minimum number of arguments a callable can take * using the same trick. */ template()(std::declval()...))> HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) { return (int)(sizeof...(Args)); } /** The recursive version is only enabled up to a recursion limit * of 256. 
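     * (Descriptive note, not in the original comment: num_args discovers, at
     * compile time, the smallest number of int arguments the callable accepts
     * by attempting the call with progressively more ints; the cap of 256
     * merely bounds that template recursion.)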
This catches callables that aren't callable with any * number of ints. */ template HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) { static_assert(sizeof...(args) <= 256, "Callable passed to for_each_element must accept either a const int *," " or up to 256 ints. No such operator found. Expect infinite template recursion."); return num_args(0, std::forward(f), 0, args...); } /** A version where the callable takes a position array instead, * with compile-time recursion on the dimensionality. This * overload is preferred to the one below using the same int vs * double trick as above, but is impossible once d hits -1 using * std::enable_if. */ template= 0)>::type> HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) { for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) { for_each_element_array_helper(0, t, std::forward(f), pos); } } /** Base case for recursion above. */ template::type> HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) { f(pos); } /** A run-time-recursive version (instead of * compile-time-recursive) that requires the callable to take a * pointer to a position array instead. Dispatches to the * compile-time-recursive version once the dimensionality gets * small. */ template static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) { if (d == -1) { f(pos); } else if (d == 0) { // Once the dimensionality gets small enough, dispatch to // a compile-time-recursive version for better codegen of // the inner loops. for_each_element_array_helper<0, Fn>(0, t, std::forward(f), pos); } else if (d == 1) { for_each_element_array_helper<1, Fn>(0, t, std::forward(f), pos); } else if (d == 2) { for_each_element_array_helper<2, Fn>(0, t, std::forward(f), pos); } else if (d == 3) { for_each_element_array_helper<3, Fn>(0, t, std::forward(f), pos); } else { for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) { for_each_element_array(d - 1, t, std::forward(f), pos); } } } /** We now have two overloads for for_each_element. This one * triggers if the callable takes a const int *. */ template()((const int *)nullptr))> static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) { int *pos = (int *)HALIDE_ALLOCA(dims * sizeof(int)); for_each_element_array(dims - 1, t, std::forward(f), pos); } /** This one triggers otherwise. It treats the callable as * something that takes some number of ints. */ template HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) { int args = num_args(0, std::forward(f)); assert(dims >= args); for_each_element_variadic(0, args - 1, t, std::forward(f)); } template void for_each_element_impl(Fn &&f) const { for_each_element_task_dim *t = (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim)); for (int i = 0; i < dimensions(); i++) { t[i].min = dim(i).min(); t[i].max = dim(i).max(); } for_each_element(0, dimensions(), t, std::forward(f)); } public: /** Call a function at each site in a buffer. This is likely to be * much slower than using Halide code to populate a buffer, but is * convenient for tests. If the function has more arguments than the * buffer has dimensions, the remaining arguments will be zero. If it * has fewer arguments than the buffer has dimensions then the last * few dimensions of the buffer are not iterated over. 
For example, * the following code exploits this to set a floating point RGB image * to red: \code Buffer im(100, 100, 3); im.for_each_element([&](int x, int y) { im(x, y, 0) = 1.0f; im(x, y, 1) = 0.0f; im(x, y, 2) = 0.0f: }); \endcode * The compiled code is equivalent to writing the a nested for loop, * and compilers are capable of optimizing it in the same way. * * If the callable can be called with an int * as the sole argument, * that version is called instead. Each location in the buffer is * passed to it in a coordinate array. This version is higher-overhead * than the variadic version, but is useful for writing generic code * that accepts buffers of arbitrary dimensionality. For example, the * following sets the value at all sites in an arbitrary-dimensional * buffer to their first coordinate: \code im.for_each_element([&](const int *pos) {im(pos) = pos[0];}); \endcode * It is also possible to use for_each_element to iterate over entire * rows or columns by cropping the buffer to a single column or row * respectively and iterating over elements of the result. For example, * to set the diagonal of the image to 1 by iterating over the columns: \code Buffer im(100, 100, 3); im.sliced(1, 0).for_each_element([&](int x, int c) { im(x, x, c) = 1.0f; }); \endcode * Or, assuming the memory layout is known to be dense per row, one can * memset each row of an image like so: \code Buffer im(100, 100, 3); im.sliced(0, 0).for_each_element([&](int y, int c) { memset(&im(0, y, c), 0, sizeof(float) * im.width()); }); \endcode */ // @{ template HALIDE_ALWAYS_INLINE const Buffer &for_each_element(Fn &&f) const { for_each_element_impl(f); return *this; } template HALIDE_ALWAYS_INLINE Buffer & for_each_element(Fn &&f) { for_each_element_impl(f); return *this; } // @} private: template struct FillHelper { Fn f; Buffer *buf; template()(std::declval()...))> void operator()(Args... args) { (*buf)(args...) = f(args...); } FillHelper(Fn &&f, Buffer *buf) : f(std::forward(f)), buf(buf) { } }; public: /** Fill a buffer by evaluating a callable at every site. The * callable should look much like a callable passed to * for_each_element, but it should return the value that should be * stored to the coordinate corresponding to the arguments. */ template::type>::value>::type> Buffer &fill(Fn &&f) { // We'll go via for_each_element. We need a variadic wrapper lambda. FillHelper wrapper(std::forward(f), this); return for_each_element(wrapper); } /** Check if an input buffer passed extern stage is a querying * bounds. Compared to doing the host pointer check directly, * this both adds clarity to code and will facilitate moving to * another representation for bounds query arguments. */ bool is_bounds_query() const { return buf.is_bounds_query(); } /** Convenient check to verify that all of the interesting bytes in the Buffer * are initialized under MSAN. Note that by default, we use for_each_value() here so that * we skip any unused padding that isn't part of the Buffer; this isn't efficient, * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check * the entire Buffer storage.) 
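     *
     * A minimal usage sketch (assumes an MSAN-instrumented build; not part of
     * the original documentation):
     * \code
     * Halide::Runtime::Buffer<float> im(16, 16);
     * im.fill(0.0f);
     * im.msan_check_mem_is_initialized();  // passes: every element was written
     * \endcode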
*/ void msan_check_mem_is_initialized(bool entire = false) const { #if defined(__has_feature) #if __has_feature(memory_sanitizer) if (entire) { __msan_check_mem_is_initialized(data(), size_in_bytes()); } else { for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); ; }); } #endif #endif } }; } // namespace Runtime } // namespace Halide #undef HALIDE_ALLOCA #endif // HALIDE_RUNTIME_IMAGE_H namespace Halide { template class Buffer; namespace Internal { struct BufferContents { mutable RefCount ref_count; std::string name; Runtime::Buffer<> buf; }; Expr buffer_accessor(const Buffer<> &buf, const std::vector &args); template struct all_ints_and_optional_name : std::false_type {}; template struct all_ints_and_optional_name : meta_and, all_ints_and_optional_name> {}; template struct all_ints_and_optional_name : meta_or, std::is_convertible> {}; template<> struct all_ints_and_optional_name<> : std::true_type {}; template::value>::type> std::string get_name_from_end_of_parameter_pack(T &&) { return ""; } inline std::string get_name_from_end_of_parameter_pack(const std::string &n) { return n; } inline std::string get_name_from_end_of_parameter_pack() { return ""; } template std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&... rest) { return get_name_from_end_of_parameter_pack(second, std::forward(rest)...); } inline void get_shape_from_start_of_parameter_pack_helper(std::vector &, const std::string &) { } inline void get_shape_from_start_of_parameter_pack_helper(std::vector &) { } template void get_shape_from_start_of_parameter_pack_helper(std::vector &result, int x, Args &&... rest) { result.push_back(x); get_shape_from_start_of_parameter_pack_helper(result, std::forward(rest)...); } template std::vector get_shape_from_start_of_parameter_pack(Args &&... args) { std::vector result; get_shape_from_start_of_parameter_pack_helper(result, std::forward(args)...); return result; } template using add_const_if_T_is_const = typename std::conditional::value, const T2, T2>::type; } // namespace Internal /** A Halide::Buffer is a named shared reference to a * Halide::Runtime::Buffer. * * A Buffer can refer to a Buffer if T1 is const whenever T2 * is const, and either T1 = T2 or T1 is void. A Buffer can * refer to any Buffer of any non-const type, and the default * template parameter is T = void. */ template class Buffer { Internal::IntrusivePtr contents; template friend class Buffer; template static void assert_can_convert_from(const Buffer &other) { if (!other.defined()) { // Avoid UB of deferencing offset of a null contents ptr static_assert((!std::is_const::value || std::is_const::value), "Can't convert from a Buffer to a Buffer"); static_assert(std::is_same::type, typename std::remove_const::type>::value || std::is_void::value || std::is_void::value, "type mismatch constructing Buffer"); } else { Runtime::Buffer::assert_can_convert_from(*(other.get())); } } public: typedef T ElemType; // This class isn't final (and is subclassed from the Python binding // code, at least) so it needs a virtual dtor. virtual ~Buffer() = default; /** Make a null Buffer, which points to no Runtime::Buffer */ Buffer() = default; /** Trivial copy constructor. */ Buffer(const Buffer &that) = default; /** Trivial copy assignment operator. */ Buffer &operator=(const Buffer &that) = default; /** Trivial move assignment operator. 
*/ Buffer &operator=(Buffer &&) noexcept = default; /** Make a Buffer from a Buffer of a different type */ template Buffer(const Buffer &other) : contents(other.contents) { assert_can_convert_from(other); } /** Move construct from a Buffer of a different type */ template Buffer(Buffer &&other) noexcept { assert_can_convert_from(other); contents = std::move(other.contents); } /** Construct a Buffer that captures and owns an rvalue Runtime::Buffer */ template Buffer(Runtime::Buffer &&buf, const std::string &name = "") : contents(new Internal::BufferContents) { contents->buf = std::move(buf); if (name.empty()) { contents->name = Internal::make_entity_name(this, "Halide:.*:Buffer<.*>", 'b'); } else { contents->name = name; } } /** Constructors that match Runtime::Buffer with two differences: * 1) They take a Type instead of a halide_type_t * 2) There is an optional last string argument that gives the buffer a specific name */ // @{ template::value>::type> explicit Buffer(Type t, int first, Args... rest) : Buffer(Runtime::Buffer(t, Internal::get_shape_from_start_of_parameter_pack(first, rest...)), Internal::get_name_from_end_of_parameter_pack(rest...)) { } explicit Buffer(const halide_buffer_t &buf, const std::string &name = "") : Buffer(Runtime::Buffer(buf), name) { } template::value>::type> explicit Buffer(int first, Args... rest) : Buffer(Runtime::Buffer(Internal::get_shape_from_start_of_parameter_pack(first, rest...)), Internal::get_name_from_end_of_parameter_pack(rest...)) { } explicit Buffer(Type t, const std::vector &sizes, const std::string &name = "") : Buffer(Runtime::Buffer(t, sizes), name) { } explicit Buffer(Type t, const std::vector &sizes, const std::vector &storage_order, const std::string &name = "") : Buffer(Runtime::Buffer(t, sizes, storage_order), name) { } explicit Buffer(const std::vector &sizes, const std::string &name = "") : Buffer(Runtime::Buffer(sizes), name) { } explicit Buffer(const std::vector &sizes, const std::vector &storage_order, const std::string &name = "") : Buffer(Runtime::Buffer(sizes, storage_order), name) { } template explicit Buffer(Array (&vals)[N], const std::string &name = "") : Buffer(Runtime::Buffer(vals), name) { } template::value>::type> explicit Buffer(Type t, Internal::add_const_if_T_is_const *data, int first, Args &&... rest) : Buffer(Runtime::Buffer(t, data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)), Internal::get_name_from_end_of_parameter_pack(rest...)) { } template::value>::type> explicit Buffer(Type t, Internal::add_const_if_T_is_const *data, const std::vector &sizes, const std::string &name = "") : Buffer(Runtime::Buffer(t, data, sizes, name)) { } template::value>::type> explicit Buffer(T *data, int first, Args &&... 
rest) : Buffer(Runtime::Buffer(data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)), Internal::get_name_from_end_of_parameter_pack(rest...)) { } explicit Buffer(T *data, const std::vector &sizes, const std::string &name = "") : Buffer(Runtime::Buffer(data, sizes), name) { } explicit Buffer(Type t, Internal::add_const_if_T_is_const *data, const std::vector &sizes, const std::string &name = "") : Buffer(Runtime::Buffer(t, data, sizes), name) { } explicit Buffer(Type t, Internal::add_const_if_T_is_const *data, int d, const halide_dimension_t *shape, const std::string &name = "") : Buffer(Runtime::Buffer(t, data, d, shape), name) { } explicit Buffer(T *data, int d, const halide_dimension_t *shape, const std::string &name = "") : Buffer(Runtime::Buffer(data, d, shape), name) { } static Buffer make_scalar(const std::string &name = "") { return Buffer(Runtime::Buffer::make_scalar(), name); } static Buffer<> make_scalar(Type t, const std::string &name = "") { return Buffer<>(Runtime::Buffer<>::make_scalar(t), name); } static Buffer make_scalar(T *data, const std::string &name = "") { return Buffer(Runtime::Buffer::make_scalar(data), name); } static Buffer make_interleaved(int width, int height, int channels, const std::string &name = "") { return Buffer(Runtime::Buffer::make_interleaved(width, height, channels), name); } static Buffer<> make_interleaved(Type t, int width, int height, int channels, const std::string &name = "") { return Buffer<>(Runtime::Buffer<>::make_interleaved(t, width, height, channels), name); } static Buffer make_interleaved(T *data, int width, int height, int channels, const std::string &name = "") { return Buffer(Runtime::Buffer::make_interleaved(data, width, height, channels), name); } static Buffer> make_interleaved(Type t, T *data, int width, int height, int channels, const std::string &name = "") { using T2 = Internal::add_const_if_T_is_const; return Buffer(Runtime::Buffer::make_interleaved(t, data, width, height, channels), name); } template static Buffer make_with_shape_of(Buffer src, void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr, const std::string &name = "") { return Buffer(Runtime::Buffer::make_with_shape_of(*src.get(), allocate_fn, deallocate_fn), name); } template static Buffer make_with_shape_of(const Runtime::Buffer &src, void *(*allocate_fn)(size_t) = nullptr, void (*deallocate_fn)(void *) = nullptr, const std::string &name = "") { return Buffer(Runtime::Buffer::make_with_shape_of(src, allocate_fn, deallocate_fn), name); } // @} /** Buffers are optionally named. */ // @{ void set_name(const std::string &n) { contents->name = n; } const std::string &name() const { return contents->name; } // @} /** Check if two Buffer objects point to the same underlying Buffer */ template bool same_as(const Buffer &other) { return (const void *)(contents.get()) == (const void *)(other.contents.get()); } /** Check if this Buffer refers to an existing * Buffer. Default-constructed Buffer objects do not refer to any * existing Buffer. */ bool defined() const { return contents.defined(); } /** Get a pointer to the underlying Runtime::Buffer */ // @{ Runtime::Buffer *get() { // It's already type-checked, so no need to use as. return (Runtime::Buffer *)(&contents->buf); } const Runtime::Buffer *get() const { return (const Runtime::Buffer *)(&contents->buf); } // @} public: // We forward numerous methods from the underlying Buffer #define HALIDE_BUFFER_FORWARD_CONST(method) \ template \ auto method(Args &&... 
args) const->decltype(std::declval>().method(std::forward(args)...)) { \ user_assert(defined()) << "Undefined buffer calling const method " #method "\n"; \ return get()->method(std::forward(args)...); \ } #define HALIDE_BUFFER_FORWARD(method) \ template \ auto method(Args &&... args)->decltype(std::declval>().method(std::forward(args)...)) { \ user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ return get()->method(std::forward(args)...); \ } // This is a weird-looking but effective workaround for a deficiency in "perfect forwarding": // namely, it can't really handle initializer-lists. The idea here is that we declare // the expected type to be passed on, and that allows the compiler to handle it. // The weirdness comes in with the variadic macro: the problem is that the type // we want to forward might be something like `std::vector>`, // which contains a comma, which throws a big wrench in C++ macro system. // However... since all we really need to do is capture the remainder of the macro, // and forward it as is, we can just use ... to allow an arbitrary number of commas, // then use __VA_ARGS__ to forward the mess as-is, and while it looks horrible, it // works. #define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...) \ inline auto method(const __VA_ARGS__ &a)->decltype(std::declval>().method(a)) { \ user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ return get()->method(a); \ } /** Does the same thing as the equivalent Halide::Runtime::Buffer method */ // @{ HALIDE_BUFFER_FORWARD(raw_buffer) HALIDE_BUFFER_FORWARD_CONST(raw_buffer) HALIDE_BUFFER_FORWARD_CONST(dimensions) HALIDE_BUFFER_FORWARD_CONST(dim) HALIDE_BUFFER_FORWARD_CONST(width) HALIDE_BUFFER_FORWARD_CONST(height) HALIDE_BUFFER_FORWARD_CONST(channels) HALIDE_BUFFER_FORWARD_CONST(min) HALIDE_BUFFER_FORWARD_CONST(extent) HALIDE_BUFFER_FORWARD_CONST(stride) HALIDE_BUFFER_FORWARD_CONST(left) HALIDE_BUFFER_FORWARD_CONST(right) HALIDE_BUFFER_FORWARD_CONST(top) HALIDE_BUFFER_FORWARD_CONST(bottom) HALIDE_BUFFER_FORWARD_CONST(number_of_elements) HALIDE_BUFFER_FORWARD_CONST(size_in_bytes) HALIDE_BUFFER_FORWARD_CONST(begin) HALIDE_BUFFER_FORWARD_CONST(end) HALIDE_BUFFER_FORWARD(data) HALIDE_BUFFER_FORWARD_CONST(data) HALIDE_BUFFER_FORWARD_CONST(contains) HALIDE_BUFFER_FORWARD(crop) HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(crop, std::vector>) HALIDE_BUFFER_FORWARD(slice) HALIDE_BUFFER_FORWARD_CONST(sliced) HALIDE_BUFFER_FORWARD(embed) HALIDE_BUFFER_FORWARD_CONST(embedded) HALIDE_BUFFER_FORWARD(set_min) HALIDE_BUFFER_FORWARD(translate) HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(translate, std::vector) HALIDE_BUFFER_FORWARD(transpose) HALIDE_BUFFER_FORWARD_CONST(transposed) HALIDE_BUFFER_FORWARD(add_dimension) HALIDE_BUFFER_FORWARD(copy_to_host) HALIDE_BUFFER_FORWARD(copy_to_device) HALIDE_BUFFER_FORWARD_CONST(has_device_allocation) HALIDE_BUFFER_FORWARD_CONST(host_dirty) HALIDE_BUFFER_FORWARD_CONST(device_dirty) HALIDE_BUFFER_FORWARD(set_host_dirty) HALIDE_BUFFER_FORWARD(set_device_dirty) HALIDE_BUFFER_FORWARD(device_sync) HALIDE_BUFFER_FORWARD(device_malloc) HALIDE_BUFFER_FORWARD(device_wrap_native) HALIDE_BUFFER_FORWARD(device_detach_native) HALIDE_BUFFER_FORWARD(allocate) HALIDE_BUFFER_FORWARD(deallocate) HALIDE_BUFFER_FORWARD(device_deallocate) HALIDE_BUFFER_FORWARD(device_free) HALIDE_BUFFER_FORWARD_CONST(all_equal) #undef HALIDE_BUFFER_FORWARD #undef HALIDE_BUFFER_FORWARD_CONST template Buffer &for_each_value(Fn &&f, Args... 
other_buffers) { get()->for_each_value(std::forward(f), (*std::forward(other_buffers).get())...); return *this; } template const Buffer &for_each_value(Fn &&f, Args... other_buffers) const { get()->for_each_value(std::forward(f), (*std::forward(other_buffers).get())...); return *this; } template Buffer &for_each_element(Fn &&f) { get()->for_each_element(std::forward(f)); return *this; } template const Buffer &for_each_element(Fn &&f) const { get()->for_each_element(std::forward(f)); return *this; } template Buffer &fill(FnOrValue &&f) { get()->fill(std::forward(f)); return *this; } static constexpr bool has_static_halide_type = Runtime::Buffer::has_static_halide_type; static halide_type_t static_halide_type() { return Runtime::Buffer::static_halide_type(); } template static bool can_convert_from(const Buffer &other) { return Halide::Runtime::Buffer::can_convert_from(*other.get()); } // Note that since Runtime::Buffer stores halide_type_t rather than Halide::Type, // there is no handle-specific type information, so all handle types are // considered equivalent to void* here. (This only matters if you are making // a Buffer-of-handles, which is not really a real use case...) Type type() const { return contents->buf.type(); } template Buffer as() const { return Buffer(*this); } Buffer copy() const { return Buffer(std::move(contents->buf.as().copy())); } template void copy_from(const Buffer &other) { contents->buf.copy_from(*other.get()); } template auto operator()(int first, Args &&... args) -> decltype(std::declval>()(first, std::forward(args)...)) { return (*get())(first, std::forward(args)...); } template auto operator()(int first, Args &&... args) const -> decltype(std::declval>()(first, std::forward(args)...)) { return (*get())(first, std::forward(args)...); } auto operator()(const int *pos) -> decltype(std::declval>()(pos)) { return (*get())(pos); } auto operator()(const int *pos) const -> decltype(std::declval>()(pos)) { return (*get())(pos); } auto operator()() -> decltype(std::declval>()()) { return (*get())(); } auto operator()() const -> decltype(std::declval>()()) { return (*get())(); } // @} /** Make an Expr that loads from this concrete buffer at a computed coordinate. */ // @{ template Expr operator()(const Expr &first, Args... rest) const { std::vector args = {first, rest...}; return (*this)(args); }; template Expr operator()(const std::vector &args) const { return buffer_accessor(Buffer<>(*this), args); }; // @} /** Copy to the GPU, using the device API that is the default for the given Target. */ int copy_to_device(const Target &t = get_jit_target_from_environment()) { return copy_to_device(DeviceAPI::Default_GPU, t); } /** Copy to the GPU, using the given device API */ int copy_to_device(const DeviceAPI &d, const Target &t = get_jit_target_from_environment()) { return contents->buf.copy_to_device(get_device_interface_for_device_api(d, t, "Buffer::copy_to_device")); } /** Allocate on the GPU, using the device API that is the default for the given Target. */ int device_malloc(const Target &t = get_jit_target_from_environment()) { return device_malloc(DeviceAPI::Default_GPU, t); } /** Allocate storage on the GPU, using the given device API */ int device_malloc(const DeviceAPI &d, const Target &t = get_jit_target_from_environment()) { return contents->buf.device_malloc(get_device_interface_for_device_api(d, t, "Buffer::device_malloc")); } /** Wrap a native handle, using the given device API. 
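     *
     * A hedged sketch (the buffer 'buf' and the device pointer below are
     * hypothetical, not part of the original documentation):
     * \code
     * uint64_t dev_ptr = ...;  // e.g. a pointer obtained from the CUDA runtime
     * int err = buf.device_wrap_native(Halide::DeviceAPI::CUDA, dev_ptr);
     * \endcode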
* It is a bad idea to pass DeviceAPI::Default_GPU to this routine * as the handle argument must match the API that the default * resolves to and it is clearer and more reliable to pass the * resolved DeviceAPI explicitly. */ int device_wrap_native(const DeviceAPI &d, uint64_t handle, const Target &t = get_jit_target_from_environment()) { return contents->buf.device_wrap_native(get_device_interface_for_device_api(d, t, "Buffer::device_wrap_native"), handle); } }; } // namespace Halide #endif namespace Halide { /** An argument to an extern-defined Func. May be a Function, Buffer, * ImageParam or Expr. */ struct ExternFuncArgument { enum ArgType { UndefinedArg = 0, FuncArg, BufferArg, ExprArg, ImageParamArg }; ArgType arg_type; Internal::FunctionPtr func; Buffer<> buffer; Expr expr; Internal::Parameter image_param; ExternFuncArgument(Internal::FunctionPtr f) : arg_type(FuncArg), func(std::move(f)) { } template ExternFuncArgument(Buffer b) : arg_type(BufferArg), buffer(b) { } ExternFuncArgument(Expr e) : arg_type(ExprArg), expr(std::move(e)) { } ExternFuncArgument(int e) : arg_type(ExprArg), expr(e) { } ExternFuncArgument(float e) : arg_type(ExprArg), expr(e) { } ExternFuncArgument(const Internal::Parameter &p) : arg_type(ImageParamArg), image_param(p) { // Scalar params come in via the Expr constructor. internal_assert(p.is_buffer()); } ExternFuncArgument() : arg_type(UndefinedArg) { } bool is_func() const { return arg_type == FuncArg; } bool is_expr() const { return arg_type == ExprArg; } bool is_buffer() const { return arg_type == BufferArg; } bool is_image_param() const { return arg_type == ImageParamArg; } bool defined() const { return arg_type != UndefinedArg; } }; } // namespace Halide #endif // HALIDE_EXTERNFUNCARGUMENT_H #ifndef HALIDE_IR_H #define HALIDE_IR_H /** \file * Subtypes for Halide expressions (\ref Halide::Expr) and statements (\ref Halide::Internal::Stmt) */ #include #include #ifndef HALIDE_REDUCTION_H #define HALIDE_REDUCTION_H /** \file * Defines internal classes related to Reduction Domains */ namespace Halide { namespace Internal { class IRMutator; /** A single named dimension of a reduction domain */ struct ReductionVariable { std::string var; Expr min, extent; /** This lets you use a ReductionVariable as a key in a map of the form * map */ struct Compare { bool operator()(const ReductionVariable &a, const ReductionVariable &b) const { return a.var < b.var; } }; }; struct ReductionDomainContents; /** A reference-counted handle on a reduction domain, which is just a * vector of ReductionVariable. */ class ReductionDomain { IntrusivePtr contents; public: /** This lets you use a ReductionDomain as a key in a map of the form * map */ struct Compare { bool operator()(const ReductionDomain &a, const ReductionDomain &b) const { internal_assert(a.contents.defined() && b.contents.defined()); return a.contents < b.contents; } }; /** Construct a new nullptr reduction domain */ ReductionDomain() : contents(nullptr) { } /** Construct a reduction domain that spans the outer product of * all values of the given ReductionVariable in scanline order, * with the start of the vector being innermost, and the end of * the vector being outermost. */ ReductionDomain(const std::vector &domain); /** Return a deep copy of this ReductionDomain. */ ReductionDomain deep_copy() const; /** Is this handle non-nullptr */ bool defined() const { return contents.defined(); } /** Tests for equality of reference. 
Only one reduction domain is
     * allowed per reduction function, and this is used to verify that
     * constraint. */
    bool same_as(const ReductionDomain &other) const {
        return contents.same_as(other.contents);
    }

    /** Immutable access to the reduction variables. */
    const std::vector<ReductionVariable> &domain() const;

    /** Add a predicate to the reduction domain. See \ref RDom::where
     * for more details. */
    void where(Expr predicate);

    /** Return the predicate defined on this reduction domain. */
    Expr predicate() const;

    /** Set the predicate, replacing any previously set predicate. */
    void set_predicate(const Expr &);

    /** Split the predicate into a vector of ANDs. If there is no predicate
     * (i.e. the entire iteration domain of this reduction domain is valid),
     * this returns an empty vector. */
    std::vector<Expr> split_predicate() const;

    /** Mark the RDom as frozen, which means it cannot accept new
     * predicates. An RDom is frozen once it is used in a Func's update
     * definition. */
    void freeze();

    /** Check if an RDom has been frozen. If so, it is an error to add
     * new predicates. */
    bool frozen() const;

    /** Pass an IRVisitor through to all Exprs referenced in the
     * ReductionDomain. */
    void accept(IRVisitor *) const;

    /** Pass an IRMutator through to all Exprs referenced in the
     * ReductionDomain. */
    void mutate(IRMutator *);
};

void split_predicate_test();

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {
namespace Internal {

class Function;

/** The actual IR nodes begin here. Remember that all the Expr
 * nodes also have a public "type" property. */

/** Cast a node from one type to another. Can't change vector widths. */
struct Cast : public ExprNode<Cast> {
    Expr value;
    static Expr make(Type t, Expr v);
    static const IRNodeType _node_type = IRNodeType::Cast;
};

/** The sum of two expressions. */
struct Add : public ExprNode<Add> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Add;
};

/** The difference of two expressions. */
struct Sub : public ExprNode<Sub> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Sub;
};

/** The product of two expressions. */
struct Mul : public ExprNode<Mul> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Mul;
};

/** The ratio of two expressions. */
struct Div : public ExprNode<Div> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Div;
};

/** The remainder of a / b. Mostly equivalent to '%' in C, except that
 * the result here is always positive. For floats, this is equivalent
 * to calling fmod. */
struct Mod : public ExprNode<Mod> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Mod;
};

/** The lesser of two values. */
struct Min : public ExprNode<Min> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Min;
};

/** The greater of two values. */
struct Max : public ExprNode<Max> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Max;
};

/** Is the first expression equal to the second? */
struct EQ : public ExprNode<EQ> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::EQ;
};

/** Is the first expression not equal to the second? */
struct NE : public ExprNode<NE> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::NE;
};

/** Is the first expression less than the second? */
struct LT : public ExprNode<LT> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::LT;
};

/** Is the first expression less than or equal to the second? */
struct LE : public ExprNode<LE> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::LE;
};

/** Is the first expression greater than the second? */
struct GT : public ExprNode<GT> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::GT;
};

/** Is the first expression greater than or equal to the second? */
struct GE : public ExprNode<GE> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::GE;
};

/** Logical and: are both expressions true? */
struct And : public ExprNode<And> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::And;
};

/** Logical or: is at least one of the expressions true? */
struct Or : public ExprNode<Or> {
    Expr a, b;
    static Expr make(Expr a, Expr b);
    static const IRNodeType _node_type = IRNodeType::Or;
};

/** Logical not: true if the expression is false. */
struct Not : public ExprNode<Not> {
    Expr a;
    static Expr make(Expr a);
    static const IRNodeType _node_type = IRNodeType::Not;
};

/** A ternary operator. Evaluates 'true_value' and 'false_value',
 * then selects between them based on 'condition'. Equivalent to
 * the ternary operator in C. */
struct Select : public ExprNode<Select>