This commit is contained in:
√(noham)² 2024-07-26 16:00:08 +02:00
parent 90e2dadf18
commit c9c3228b2c
7501 changed files with 3266200 additions and 1 deletions

2
.gitignore vendored
View File

@ -157,7 +157,7 @@ node_modules/
VoTT-v2-none.zip
VoTT-v2-done.zip
VoTT-v2/output/vott-json-export.zip
/OpenCV/opencv
# /OpenCV/opencv
VoTT-v2/vids/Every 100m World Lead Since 2009. \[JoZOpTWBI1E\].mp4
VoTT-v2/vids/World Record - 400m Men Final Sevilla 1999 \[udlrRIYBUsw\].mp4
upscale/400m_alq13.mp4

1
OpenCV/opencv/3rdparty/.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
* -whitespace

View File

@ -0,0 +1,8 @@
# Gedit temp files
*~
# Qt Creator file
*.user
# MacOS-specific (Desktop Services Store)
.DS_Store

View File

@ -0,0 +1,60 @@
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
project(Carotene)
set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
set(CAROTENE_INCLUDE_DIR include)
set(CAROTENE_SOURCE_DIR src)
file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
"${CAROTENE_SOURCE_DIR}/*.hpp")
include_directories(${CAROTENE_INCLUDE_DIR})
if(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
# allow more inlines - these parameters improve performance for:
# - matchTemplate about 5-10%
# - goodFeaturesToTrack 10-20%
# - cornerHarris 30% for some cases
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "10.0.0")
set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
else()
set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipa-cp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
endif()
if(APPLE AND CV_CLANG AND WITH_NEON)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-unused-function)
endif()
add_library(carotene_objs OBJECT EXCLUDE_FROM_ALL
${carotene_headers}
${carotene_sources}
)
if(NOT CAROTENE_NS STREQUAL "carotene")
target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
endif()
if(WITH_NEON)
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
if(NOT DEFINED CAROTENE_NEON_ARCH )
elseif(CAROTENE_NEON_ARCH EQUAL 8)
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
elseif(CAROTENE_NEON_ARCH EQUAL 7)
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
else()
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
endif()
endif()
if(MINGW)
target_compile_definitions(carotene_objs PRIVATE "-D_USE_MATH_DEFINES=1")
endif()
# we add dummy file to fix XCode build
add_library(carotene STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} "$<TARGET_OBJECTS:carotene_objs>" "${CAROTENE_SOURCE_DIR}/dummy.cpp")

View File

@ -0,0 +1,2 @@
This is Carotene, a low-level library containing optimized CPU routines
that are useful for computer vision algorithms.

View File

@ -0,0 +1,102 @@
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM TRUE)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
set(AARCH64 TRUE)
endif()
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-function)
set(TEGRA_COMPILER_FLAGS "")
if(CV_GCC OR CV_CLANG)
# Generate unwind information even for functions that can't throw/propagate exceptions.
# This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
endif()
if(CV_GCC OR CV_CLANG)
if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
elseif(CV_CLANG)
list(APPEND TEGRA_COMPILER_FLAGS -fwrapv)
else()
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
-fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
endif()
if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
-floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
endif()
endif()
string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
if(ARMEABI_V7A)
if(CV_GCC OR CV_CLANG)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
endif()
endif()
if(WITH_LOGS)
add_definitions(-DHAVE_LOGS)
endif()
set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
function(compile_carotene)
if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
set(WITH_NEON ON)
endif()
add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
endfunction()
compile_carotene()
include_directories("${CAROTENE_DIR}/include")
get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
if(CV_GCC)
# allow more inlines - these parameters improve performance for:
# matchTemplate about 5-10%
# goodFeaturesToTrack 10-20%
# cornerHarris 30% for some cases
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "10.0.0")
set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
else()
set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipa-cp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
# set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
# we add dummy file to fix XCode build
add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs> "dummy.cpp")
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)
set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)
configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)

View File

@ -0,0 +1,2 @@
// This file is needed for compilation on some platforms e.g. with XCode generator
// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,47 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_DEFINITIONS_HPP
#define CAROTENE_DEFINITIONS_HPP
#ifndef CAROTENE_NS
#define CAROTENE_NS carotene
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,125 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_TYPES_HPP
#define CAROTENE_TYPES_HPP
#include <carotene/definitions.hpp>
#include <stdint.h>
#include <cstddef>
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
namespace CAROTENE_NS {
using std::size_t;
using std::ptrdiff_t;
typedef int8_t s8;
typedef uint8_t u8;
typedef int16_t s16;
typedef uint16_t u16;
typedef int32_t s32;
typedef uint32_t u32;
typedef float f32;
typedef int64_t s64;
typedef uint64_t u64;
typedef double f64;
typedef ptrdiff_t stride_t;
enum CONVERT_POLICY
{
CONVERT_POLICY_WRAP,
CONVERT_POLICY_SATURATE
};
enum BORDER_MODE
{
BORDER_MODE_UNDEFINED,
BORDER_MODE_CONSTANT,
BORDER_MODE_REPLICATE,
BORDER_MODE_REFLECT,
BORDER_MODE_REFLECT101,
BORDER_MODE_WRAP
};
enum FLIP_MODE
{
FLIP_HORIZONTAL_MODE = 1,
FLIP_VERTICAL_MODE = 2,
FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
};
enum COLOR_SPACE
{
COLOR_SPACE_BT601,
COLOR_SPACE_BT709
};
struct Size2D {
Size2D() : width(0), height(0) {}
Size2D(size_t width_, size_t height_) : width(width_), height(height_) {}
size_t width;
size_t height;
inline size_t total() const
{
return width * height;
}
};
struct Margin {
Margin() : left(0), right(0), top(0), bottom(0) {}
Margin(size_t left_, size_t right_, size_t top_, size_t bottom_)
: left(left_), right(right_), top(top_), bottom(bottom_) {}
// these are measured in elements
size_t left, right, top, bottom;
};
struct KeypointStore {
virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;
virtual ~KeypointStore() {};
};
}
#endif

View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct AbsDiff
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vabdq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vabd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0];
}
};
template <typename T>
struct AbsDiffSigned
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
v_dst = internal::vqsubq(v_max, v_min);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
v_dst = internal::vqsub(v_max, v_min);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
}
};
} // namespace
#endif
void absDiff(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<u8>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const u16 *src0Base, ptrdiff_t src0Stride,
const u16 *src1Base, ptrdiff_t src1Stride,
u16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<u16>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s8 *src0Base, ptrdiff_t src0Stride,
const s8 *src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s8>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s16 *src0Base, ptrdiff_t src0Stride,
const s16 *src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s16>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s32 *src0Base, ptrdiff_t src0Stride,
const s32 *src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,408 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
void accumulate(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j);
int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
v_dst0 = vqaddq_s16(v_dst0, v_src0);
v_dst1 = vqaddq_s16(v_dst1, v_src1);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
int16x8_t v_dst = vld1q_s16(dst + j);
v_dst = vqaddq_s16(v_dst, v_src16);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void accumulateSquareConst(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
}
}
}
template <>
void accumulateSquareConst<0>(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
}
}
}
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride);
} // namespace
#endif
void accumulateSquare(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride,
u32 shift)
{
if (shift >= 16)
{
for (size_t i = 0; i < size.height; ++i)
{
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(s16) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
accumulateSquareConstFunc funcs[16] =
{
accumulateSquareConst<0>,
accumulateSquareConst<1>,
accumulateSquareConst<2>,
accumulateSquareConst<3>,
accumulateSquareConst<4>,
accumulateSquareConst<5>,
accumulateSquareConst<6>,
accumulateSquareConst<7>,
accumulateSquareConst<8>,
accumulateSquareConst<9>,
accumulateSquareConst<10>,
accumulateSquareConst<11>,
accumulateSquareConst<12>,
accumulateSquareConst<13>,
accumulateSquareConst<14>,
accumulateSquareConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
#endif
}
#ifdef CAROTENE_NEON
namespace {
struct AccumulateWeightedHalf
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vhaddq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vhadd_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
}
};
struct AccumulateWeighted
{
typedef u8 type;
float alpha, beta;
float32x4_t v_alpha, v_beta;
explicit AccumulateWeighted(float _alpha) :
alpha(_alpha), beta(1 - _alpha)
{
v_alpha = vdupq_n_f32(alpha);
v_beta = vdupq_n_f32(beta);
}
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
}
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
uint8x8_t & v_dst) const
{
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vmovn_u16(_v_dst);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = beta * src1[0] + alpha * src0[0];
}
};
} // namespace
#endif
void accumulateWeighted(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride,
f32 alpha)
{
if (alpha == 0.0f)
return;
if (alpha == 1.0f)
{
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memcpy(dst, src, sizeof(u8) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// in this case we can use the following scheme:
// dst[p] = (src[p] + dst[p]) >> 1
// which is faster
if (alpha == 0.5f)
{
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeightedHalf());
return;
}
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeighted(alpha));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)alpha;
#endif
}
} //namespace CAROTENE_NS

View File

@ -0,0 +1,475 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct AddWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
}
};
template <typename T, typename WT>
struct AddSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
}
};
} // namespace
#endif
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u8, u16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u8, u16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
}
for (; j < size.width; j++)
dst[j] = (u16)src0[j] + (u16)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u16, u32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u16, u32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u32 * src0Base, ptrdiff_t src0Stride,
const u32 * src1Base, ptrdiff_t src1Stride,
u32 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u32, u64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u32, u64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<f32, f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,266 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
using namespace internal;
template <typename T> struct TypeTraits;
template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; };
template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; };
template <> struct TypeTraits<u16> { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; };
template <> struct TypeTraits<s16> { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; };
template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; };
template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; };
template <> struct TypeTraits<f32> { typedef f64 wide; typedef float32x4_t vec128; };
template <typename T> struct wAdd
{
typedef T type;
f32 alpha, beta, gamma;
typedef typename TypeTraits<T>::wide wtype;
wAdd<wtype> wideAdd;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma),
wideAdd(_alpha, _beta, _gamma) {}
void operator() (const typename VecTraits<T>::vec128 & v_src0,
const typename VecTraits<T>::vec128 & v_src1,
typename VecTraits<T>::vec128 & v_dst) const
{
typename VecTraits<wtype>::vec128 vrl, vrh;
wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
}
void operator() (const typename VecTraits<T>::vec64 & v_src0,
const typename VecTraits<T>::vec64 & v_src1,
typename VecTraits<T>::vec64 & v_dst) const
{
typename VecTraits<wtype>::vec128 vr;
wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
v_dst = vqmovn(vr);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<s32>
{
typedef s32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma);
}
void operator() (const VecTraits<s32>::vec128 & v_src0,
const VecTraits<s32>::vec128 & v_src1,
VecTraits<s32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_s32(v_src0);
float32x4_t vs2 = vcvtq_f32_s32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vroundq_s32_f32(vs1);
}
void operator() (const VecTraits<s32>::vec64 & v_src0,
const VecTraits<s32>::vec64 & v_src1,
VecTraits<s32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_s32(v_src0);
float32x2_t vs2 = vcvt_f32_s32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vround_s32_f32(vs1);
}
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
{
dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<u32>
{
typedef u32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma);
}
void operator() (const VecTraits<u32>::vec128 & v_src0,
const VecTraits<u32>::vec128 & v_src1,
VecTraits<u32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_u32(v_src0);
float32x4_t vs2 = vcvtq_f32_u32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vroundq_u32_f32(vs1);
}
void operator() (const VecTraits<u32>::vec64 & v_src0,
const VecTraits<u32>::vec64 & v_src1,
VecTraits<u32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_u32(v_src0);
float32x2_t vs2 = vcvt_f32_u32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vround_u32_f32(vs1);
}
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
{
dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<f32>
{
typedef f32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const VecTraits<f32>::vec128 & v_src0,
const VecTraits<f32>::vec128 & v_src1,
VecTraits<f32>::vec128 & v_dst) const
{
float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
v_dst = vmlaq_f32(vs1, v_src1, vbeta);
}
void operator() (const VecTraits<f32>::vec64 & v_src0,
const VecTraits<f32>::vec64 & v_src1,
VecTraits<f32>::vec64 & v_dst) const
{
float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
}
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
}
};
} // namespace
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride, \
f32 alpha, f32 beta, f32 gamma) \
{ \
internal::assertSupportedConfiguration(); \
wAdd<type> wgtAdd(alpha, \
beta, \
gamma); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
wgtAdd); \
}
#else
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t, \
f32, f32, f32) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,225 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
struct BitwiseAnd
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vandq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vand_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] & src1[0];
}
};
struct BitwiseOr
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vorrq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vorr_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] | src1[0];
}
};
struct BitwiseXor
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = veorq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = veor_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] ^ src1[0];
}
};
#endif
void bitwiseNot(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
u8* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src + j);
uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16);
uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1);
vst1q_u8(dst + j, v_dst0);
vst1q_u8(dst + j + 16, v_dst1);
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_dst = vmvn_u8(v_src);
vst1_u8(dst + j, v_dst);
}
for (; j < size.width; j++)
{
dst[j] = ~src[j];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseAnd(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseAnd());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseOr(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseOr());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseXor(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseXor());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,773 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
struct RowFilter3x3Canny
{
inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
{
vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
lookLeft = offsetk - borderxl;
lookRight = offsetk - borderxr;
}
inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
{
uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
ptrdiff_t i = 0;
for (; i < width - 8 + lookRight; i += 8)
{
internal::prefetch(src + i);
uint8x8_t l18u = vld1_u8(src + i + 1);
uint8x8_t l2 = l18u;
uint8x8_t l0 = vext_u8(l, l18u, 6);
int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
l = l18u;
int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
int16x8_t ldy = vaddq_s16(l02, l1x2);
vst1q_s16(dstx + i, ldx);
vst1q_s16(dsty + i, ldy);
}
//tail
if (lookRight == 0 || i != width)
{
uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
vst1q_s16(dstx + (width - 8), taildx);
vst1q_s16(dsty + (width - 8), taildy);
}
}
uint8x8_t vfmask;
uint8x8_t vtmask;
enum { offsetk = 1};
ptrdiff_t lookLeft;
ptrdiff_t lookRight;
};
template <bool L2gradient>
inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
ColFilter3x3CannyL1Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int16x8_t dya = vabsq_s16(dy);
int16x8_t dxa = vabsq_s16(dx);
int16x8_t norm = vaddq_s16(dya, dxa);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j + 4, normh);
vst1q_s32(mag + j, norml);
}
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL1Loop;
}
}
template <>
inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
ColFilter3x3CannyL2Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));
norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j, norml);
vst1q_s32(mag + j + 4, normh);
}
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL2Loop;
}
}
template <bool L2gradient>
inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
ptrdiff_t j = 0;
if (colscn >= 8)
{
int16x8_t vx = vld1q_s16(_dx);
int16x8_t vy = vld1q_s16(_dy);
for (; j <= colscn - 16; j+=8)
{
internal::prefetch(_dx);
internal::prefetch(_dy);
int16x8_t vx2 = vld1q_s16(_dx + j + 8);
int16x8_t vy2 = vld1q_s16(_dy + j + 8);
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
vx = vx2;
vy = vy2;
}
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
}
for (; j < colscn; j++)
_norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
}
template <>
inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
ptrdiff_t j = 0;
if (colscn >= 8)
{
int16x8_t vx = vld1q_s16(_dx);
int16x8_t vy = vld1q_s16(_dy);
for (; j <= colscn - 16; j+=8)
{
internal::prefetch(_dx);
internal::prefetch(_dy);
int16x8_t vxnext = vld1q_s16(_dx + j + 8);
int16x8_t vynext = vld1q_s16(_dy + j + 8);
int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
vst1q_s32(_norm + j + 0, norml);
vst1q_s32(_norm + j + 4, normh);
vx = vxnext;
vy = vynext;
}
int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
vst1q_s32(_norm + j + 0, norml);
vst1q_s32(_norm + j + 4, normh);
}
for (; j < colscn; j++)
_norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
}
template <bool L2gradient>
inline void prepareThresh(f64 low_thresh, f64 high_thresh,
s32 &low, s32 &high)
{
if (low_thresh > high_thresh)
std::swap(low_thresh, high_thresh);
#if defined __GNUC__
low = (s32)low_thresh;
high = (s32)high_thresh;
low -= (low > low_thresh);
high -= (high > high_thresh);
#else
low = internal::round(low_thresh);
high = internal::round(high_thresh);
f32 ldiff = (f32)(low_thresh - low);
f32 hdiff = (f32)(high_thresh - high);
low -= (ldiff < 0);
high -= (hdiff < 0);
#endif
}
template <>
inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
s32 &low, s32 &high)
{
if (low_thresh > high_thresh)
std::swap(low_thresh, high_thresh);
if (low_thresh > 0) low_thresh *= low_thresh;
if (high_thresh > 0) high_thresh *= high_thresh;
#if defined __GNUC__
low = (s32)low_thresh;
high = (s32)high_thresh;
low -= (low > low_thresh);
high -= (high > high_thresh);
#else
low = internal::round(low_thresh);
high = internal::round(high_thresh);
f32 ldiff = (f32)(low_thresh - low);
f32 hdiff = (f32)(high_thresh - high);
low -= (ldiff < 0);
high -= (hdiff < 0);
#endif
}
template <bool L2gradient, bool externalSobel>
struct _normEstimator
{
ptrdiff_t magstep;
ptrdiff_t dxOffset;
ptrdiff_t dyOffset;
ptrdiff_t shxOffset;
ptrdiff_t shyOffset;
std::vector<u8> buffer;
const ptrdiff_t offsetk;
ptrdiff_t borderyt, borderyb;
RowFilter3x3Canny sobelRow;
inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
offsetk(1),
sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
{
mapstep = size.width + 2;
magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
dxOffset = mapstep * sizeof(s32)/sizeof(s16);
dyOffset = dxOffset + size.width * 1;
shxOffset = dxOffset + size.width * 2;
shyOffset = dxOffset + size.width * 3;
buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + magstep;
mag_buf[2] = mag_buf[1] + magstep;
memset(mag_buf[0], 0, mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + magstep);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
}
inline void firstRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
s32** mag_buf)
{
//sobelH row #0
const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
//sobelH row #1
_src = internal::getRowPtr(srcBase, srcStride, 1);
sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
if (borderyt == 0)
{
//sobelH row #-1
_src = internal::getRowPtr(srcBase, srcStride, -1);
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
else
{
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
}
inline void nextRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
if (i < size.height - borderyb)
{
const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
//sobelH row #i+1
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else if (i < size.height)
{
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else
memset(mag_buf[2], 0, mapstep*sizeof(s32));
_x = ((s16*)mag_buf[1]) + dxOffset;
_y = ((s16*)mag_buf[1]) + dyOffset;
}
};
template <bool L2gradient>
struct _normEstimator<L2gradient, true>
{
std::vector<u8> buffer;
inline _normEstimator(const Size2D &size, s32 cn, Margin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
{
mapstep = size.width + 2;
buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + mapstep*cn;
mag_buf[2] = mag_buf[1] + mapstep*cn;
memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + mapstep*cn);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
}
inline void firstRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
s32** mag_buf)
{
s32* _norm = mag_buf[1] + 1;
s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
_norm[-1] = _norm[size.width] = 0;
}
inline void nextRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
s32* _norm = mag_buf[(i > 0) + 1] + 1;
if (i < size.height)
{
s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
_norm[-1] = _norm[size.width] = 0;
}
else
memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));
_x = internal::getRowPtr(dxBase, dxStride, i-1);
_y = internal::getRowPtr(dyBase, dyStride, i-1);
}
};
template <bool L2gradient, bool externalSobel>
inline void Canny3x3(const Size2D &size, s32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
s32 low, high;
prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);
ptrdiff_t mapstep;
s32* mag_buf[3];
u8* map;
_normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);
size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
std::vector<u8*> stack( maxsize );
u8 **stack_top = &stack[0];
u8 **stack_bottom = &stack[0];
/* sector numbers
(Top-Left Origin)
1 2 3
* * *
* * *
0*******0
* * *
* * *
3 2 1
*/
#define CANNY_PUSH(d) *(d) = u8(2), *stack_top++ = (d)
#define CANNY_POP(d) (d) = *--stack_top
//i == 0
normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);
// calculate magnitude and angle of gradient, perform non-maxima supression.
// fill the map with one of the following values:
// 0 - the pixel might belong to an edge
// 1 - the pixel can not belong to an edge
// 2 - the pixel does belong to an edge
for (size_t i = 1; i <= size.height; i++)
{
const s16 *_x, *_y;
normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);
u8* _map = map + mapstep*i + 1;
_map[-1] = _map[size.width] = 1;
s32* _mag = mag_buf[1] + 1; // take the central row
ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
if ((stack_top - stack_bottom) + size.width > maxsize)
{
ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
maxsize = maxsize * 3/2;
stack.resize(maxsize);
stack_bottom = &stack[0];
stack_top = stack_bottom + sz;
}
s32 prev_flag = 0;
for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
{
#define CANNY_SHIFT 15
const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
s32 m = _mag[j];
if (m > low)
{
s32 xs = _x[j];
s32 ys = _y[j];
s32 x = abs(xs);
s32 y = abs(ys) << CANNY_SHIFT;
s32 tg22x = x * TG22;
if (y < tg22x)
{
if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
}
else
{
s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
if (y > tg67x)
{
if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
}
else
{
s32 s = (xs ^ ys) < 0 ? -1 : 1;
if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
}
}
}
prev_flag = 0;
_map[j] = u8(1);
continue;
__push:
if (!prev_flag && m > high && _map[j-mapstep] != 2)
{
CANNY_PUSH(_map + j);
prev_flag = 1;
}
else
_map[j] = 0;
}
// scroll the ring buffer
_mag = mag_buf[0];
mag_buf[0] = mag_buf[1];
mag_buf[1] = mag_buf[2];
mag_buf[2] = _mag;
}
// now track the edges (hysteresis thresholding)
while (stack_top > stack_bottom)
{
u8* m;
if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
{
ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
maxsize = maxsize * 3/2;
stack.resize(maxsize);
stack_bottom = &stack[0];
stack_top = stack_bottom + sz;
}
CANNY_POP(m);
if (!m[-1]) CANNY_PUSH(m - 1);
if (!m[1]) CANNY_PUSH(m + 1);
if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
if (!m[-mapstep]) CANNY_PUSH(m - mapstep);
if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
if (!m[mapstep-1]) CANNY_PUSH(m + mapstep - 1);
if (!m[mapstep]) CANNY_PUSH(m + mapstep);
if (!m[mapstep+1]) CANNY_PUSH(m + mapstep + 1);
}
// the final pass, form the final image
uint8x16_t v2 = vmovq_n_u8(2);
const u8* ptrmap = map + mapstep + 1;
for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
{
u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
ptrdiff_t j = 0;
for (; j < (ptrdiff_t)size.width - 16; j += 16)
{
internal::prefetch(ptrmap);
uint8x16_t vmap = vld1q_u8(ptrmap + j);
uint8x16_t vdst = vceqq_u8(vmap, v2);
vst1q_u8(_dst+j, vdst);
}
for (; j < (ptrdiff_t)size.width; j++)
_dst[j] = (u8)-(ptrmap[j] >> 1);
}
}
} // namespace
#endif
bool isCanny3x3Supported(const Size2D &size)
{
return isSupportedConfiguration() &&
size.height >= 2 && size.width >= 9;
}
void Canny3x3L1(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
Canny3x3<false, false>(size, 1,
srcBase, srcStride,
dstBase, dstStride,
NULL, 0,
NULL, 0,
low_thresh, high_thresh,
borderMargin);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)low_thresh;
(void)high_thresh;
(void)borderMargin;
#endif
}
void Canny3x3L2(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
Canny3x3<true, false>(size, 1,
srcBase, srcStride,
dstBase, dstStride,
NULL, 0,
NULL, 0,
low_thresh, high_thresh,
borderMargin);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)low_thresh;
(void)high_thresh;
(void)borderMargin;
#endif
}
void Canny3x3L1(const Size2D &size, s32 cn,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Canny3x3<false, true>(size, cn,
NULL, 0,
dstBase, dstStride,
dxBase, dxStride,
dyBase, dyStride,
low_thresh, high_thresh,
Margin());
#else
(void)size;
(void)cn;
(void)dstBase;
(void)dstStride;
(void)dxBase;
(void)dxStride;
(void)dyBase;
(void)dyStride;
(void)low_thresh;
(void)high_thresh;
#endif
}
void Canny3x3L2(const Size2D &size, s32 cn,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Canny3x3<true, true>(size, cn,
NULL, 0,
dstBase, dstStride,
dxBase, dxStride,
dyBase, dyStride,
low_thresh, high_thresh,
Margin());
#else
(void)size;
(void)cn;
(void)dstBase;
(void)dstStride;
(void)dxBase;
(void)dxStride;
(void)dyBase;
(void)dyStride;
(void)low_thresh;
(void)high_thresh;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,486 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
void extract2(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef __ANDROID__
for (; dj < roiw32; sj += 64, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x2_t v_src = vld2q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld2q_u8(src + sj + 32);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 16, dj += 8)
{
uint8x8x2_t v_src = vld2_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 2, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
void extract3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef __ANDROID__
for (; dj < roiw32; sj += 96, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x3_t v_src = vld3q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld3q_u8(src + sj + 48);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 24, dj += 8)
{
uint8x8x3_t v_src = vld3_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 3, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
void extract4(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef __ANDROID__
for (; dj < roiw32; sj += 128, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x4_t v_src = vld4q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld4q_u8(src + sj + 64);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 32, dj += 8)
{
uint8x8x4_t v_src = vld4_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 4, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride
#ifdef CAROTENE_NEON
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
#define CONTDST2 srcStride == dst0Stride && \
srcStride == dst1Stride &&
#define CONTDST3 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride &&
#define CONTDST4 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride && \
srcStride == dst3Stride &&
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
"vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
"vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3" \
);
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
"vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
"vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5" \
);
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
"vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
);
#define SPLIT_QUAD(sgn, bits, n) { \
internal::prefetch(src + sj); \
SPLIT_ASM##n(sgn, bits) \
}
#else
#define SPLIT_QUAD(sgn, bits, n) { \
internal::prefetch(src + sj); \
vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
FILL_LINES##n(VST1Q, sgn##bits) \
}
#endif
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##bits) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
SPLIT_QUAD(sgn, bits, n) \
\
if (dj < roiw8) \
{ \
vec64 v_src = vld##n##_##sgn##bits(src + sj); \
FILL_LINES##n(VST1, sgn##bits) \
sj += MUL##n(8)/sizeof(sgn##bits); \
dj += 8/sizeof(sgn##bits); \
} \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
FILL_LINES##n(SST, sgn##bits) \
} \
} \
}
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
const sgn##64 * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##64) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##64) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
vec64 v_src = vld##n##_##sgn##64(src + sj); \
FILL_LINES##n(VST1, sgn##64) \
} \
} \
}
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
#define ALPHA_QUAD(sgn, bits) { \
internal::prefetch(src + sj); \
__asm__ ( \
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
"vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
"vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
"vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
: \
: [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
[in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
); \
}
#else
#define ALPHA_QUAD(sgn, bits) { \
internal::prefetch(src + sj); \
union { vec128_4 v4; vec128_3 v3; } vals; \
vals.v4 = vld4q_##sgn##bits(src + sj); \
vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
}
#endif
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dst3Stride && \
srcStride == dst1Stride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
size_t sj = 0u, d3j = 0u, d1j = 0u; \
\
for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
d1j += 16/sizeof(sgn##bits)) \
ALPHA_QUAD(sgn, bits) \
\
if (d1j < roiw8) \
{ \
union { vec64_4 v4; vec64_3 v3; } vals; \
vals.v4 = vld4_##sgn##bits(src + sj); \
vst3_u8(dst3 + d3j, vals.v3); \
vst1_u8(dst1 + d1j, vals.v4.val[3]); \
sj += MUL4(8)/sizeof(sgn##bits); \
d3j += MUL3(8)/sizeof(sgn##bits); \
d1j += 8/sizeof(sgn##bits); \
} \
\
for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
{ \
dst3[d3j+0] = src[sj + 0]; \
dst3[d3j+1] = src[sj + 1]; \
dst3[d3j+2] = src[sj + 2]; \
dst1[d1j] = src[sj + 3]; \
} \
} \
}
#else
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
FILL_LINES##n(VOID, sgn##bits) \
}
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
(void)dst3Base; \
(void)dst3Stride; \
(void)dst1Base; \
(void)dst1Stride; \
}
#endif //CAROTENE_NEON
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)
SPLIT4ALPHA(u,8)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,389 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
#ifdef CAROTENE_NEON
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
#define CONTSRC2 dstStride == src0Stride && \
dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride && \
dstStride == src3Stride &&
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
#define MERGE_ASM2(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3" \
);
#define MERGE_ASM3(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
"vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5" \
);
#define MERGE_ASM4(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
);
#define MERGE_QUAD(sgn, bits, n) { \
FILL_LINES##n(PREF, sgn##bits) \
MERGE_ASM##n(sgn, bits) \
}
#else
#define MERGE_QUAD(sgn, bits, n) { \
vec128 v_dst; \
/*FILL_LINES##n(PREF, sgn##bits) \
FILL_LINES##n(VLD1Q, sgn##bits)*/ \
FILL_LINES##n(PRLD, sgn##bits) \
vst##n##q_##sgn##bits(dst + dj, v_dst); \
}
#endif
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##bits) \
sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
MERGE_QUAD(sgn, bits, n) \
\
if ( sj < roiw8 ) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##bits) \
vst##n##_##sgn##bits(dst + dj, v_dst); \
sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
} \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
FILL_LINES##n(SLD, sgn##bits) \
} \
} \
}
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##64), \
sgn##64 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##64) \
sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##64) \
vst##n##_##sgn##64(dst + dj, v_dst); \
/*FILL_LINES##n(SLD, sgn##64)*/ \
} \
} \
}
#else
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
FILL_LINES##n(VOID, sgn##bits) \
(void)dstBase; \
(void)dstStride; \
}
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
#endif //CAROTENE_NEON
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
void combineYUYV(const Size2D &size,
const u8 * srcyBase, ptrdiff_t srcyStride,
const u8 * srcuBase, ptrdiff_t srcuStride,
const u8 * srcvBase, ptrdiff_t srcvStride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; i += 1)
{
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t syj = 0u, sj = 0u, dj = 0u;
#ifndef __ANDROID__
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
{
internal::prefetch(srcy + syj);
internal::prefetch(srcu + sj);
internal::prefetch(srcv + sj);
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
uint8x16x4_t v_dst;
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1q_u8(srcu + sj);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1q_u8(srcv + sj);
vst4q_u8(dst + dj, v_dst);
v_y = vld2q_u8(srcy + syj + 32);
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1q_u8(srcu + sj + 16);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1q_u8(srcv + sj + 16);
vst4q_u8(dst + dj + 64, v_dst);
}
#endif
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
{
uint8x8x2_t v_y = vld2_u8(srcy + syj);
uint8x8x4_t v_dst;
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1_u8(srcu + sj);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1_u8(srcv + sj);
vst4_u8(dst + dj, v_dst);
}
for (; sj < size.width; ++sj, syj += 2, dj += 4)
{
dst[dj] = srcy[syj];
dst[dj + 1] = srcu[sj];
dst[dj + 2] = srcy[syj + 1];
dst[dj + 3] = srcv[sj];
}
}
#else
(void)size;
(void)srcyBase;
(void)srcyStride;
(void)srcuBase;
(void)srcuStride;
(void)srcvBase;
(void)srcvStride;
(void)dstBase;
(void)dstStride;
#endif
}
void combineUYVY(const Size2D &size,
const u8 * srcyBase, ptrdiff_t srcyStride,
const u8 * srcuBase, ptrdiff_t srcuStride,
const u8 * srcvBase, ptrdiff_t srcvStride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t syj = 0u, sj = 0u, dj = 0u;
#ifndef __ANDROID__
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
{
internal::prefetch(srcy + syj);
internal::prefetch(srcu + sj);
internal::prefetch(srcv + sj);
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
uint8x16x4_t v_dst;
v_dst.val[0] = vld1q_u8(srcu + sj);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1q_u8(srcv + sj);
v_dst.val[3] = v_y.val[1];
vst4q_u8(dst + dj, v_dst);
v_y = vld2q_u8(srcy + syj + 32);
v_dst.val[0] = vld1q_u8(srcu + sj + 16);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1q_u8(srcv + sj + 16);
v_dst.val[3] = v_y.val[1];
vst4q_u8(dst + dj + 64, v_dst);
}
#endif
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
{
uint8x8x2_t v_y = vld2_u8(srcy + syj);
uint8x8x4_t v_dst;
v_dst.val[0] = vld1_u8(srcu + sj);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1_u8(srcv + sj);
v_dst.val[3] = v_y.val[1];
vst4_u8(dst + dj, v_dst);
}
for (; sj < size.width; ++sj, syj += 2, dj += 4)
{
dst[dj] = srcu[sj];
dst[dj + 1] = srcy[syj];
dst[dj + 2] = srcv[sj];
dst[dj + 3] = srcy[syj + 1];
}
}
#else
(void)size;
(void)srcyBase;
(void)srcyStride;
(void)srcuBase;
(void)srcuStride;
(void)srcvBase;
(void)srcvStride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
template <typename Op, int elsize> struct vtail
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
//do nothing since there couldn't be enough data
(void)src0;
(void)src1;
(void)dst;
(void)op;
(void)x;
(void)width;
}
};
template <typename Op> struct vtail<Op, 2>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if( x + 8 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, internal::vmovn(v_dst));
x+=8;
}
}
};
template <typename Op> struct vtail<Op, 1>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
typedef typename internal::VecTraits<type>::vec64 vec64;
typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if( x + 16 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1q(dst + x, v_dst);
x+=16;
}
if( x + 8 < width)
{
vec64 v_src0, v_src1;
uvec64 v_dst;
v_src0 = internal::vld1(src0 + x);
v_src1 = internal::vld1(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, v_dst);
x+=8;
}
}
};
template <typename Op>
void vcompare(Size2D size,
const typename Op::type * src0Base, ptrdiff_t src0Stride,
const typename Op::type * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride, const Op & op)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
if (src0Stride == src1Stride && src0Stride == dstStride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
{
size.width *= size.height;
size.height = 1;
}
const u32 step_base = 32 / sizeof(type);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
for (size_t y = 0; y < size.height; ++y)
{
const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for( ; x < roiw_base; x += step_base )
{
internal::prefetch(src0 + x);
internal::prefetch(src1 + x);
vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
uvec128 v_dst0;
uvec128 v_dst1;
op(v_src00, v_src10, v_dst0);
op(v_src01, v_src11, v_dst1);
vnst(dst + x, v_dst0, v_dst1);
}
vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
for (; x < size.width; ++x)
{
op(src0 + x, src1 + x, dst + x);
}
}
}
template<typename T>
struct OpCmpEQ
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vceqq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vceq(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpNE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 0 : 255;
}
};
template<typename T>
struct OpCmpGT
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgtq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcgt(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] > src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpGE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgeq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcge(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] >= src1[0] ? 255 : 0;
}
};
}
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
vcompare(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
OpCmp##op<type>()); \
}
#else
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)src0Base; \
(void)src0Stride; \
(void)src1Base; \
(void)src1Stride; \
(void)dstBase; \
(void)dstStride; \
}
#endif
IMPL_CMPOP(EQ, u8)
IMPL_CMPOP(EQ, s8)
IMPL_CMPOP(EQ, u16)
IMPL_CMPOP(EQ, s16)
IMPL_CMPOP(EQ, u32)
IMPL_CMPOP(EQ, s32)
IMPL_CMPOP(EQ, f32)
IMPL_CMPOP(NE, u8)
IMPL_CMPOP(NE, s8)
IMPL_CMPOP(NE, u16)
IMPL_CMPOP(NE, s16)
IMPL_CMPOP(NE, u32)
IMPL_CMPOP(NE, s32)
IMPL_CMPOP(NE, f32)
IMPL_CMPOP(GT, u8)
IMPL_CMPOP(GT, s8)
IMPL_CMPOP(GT, u16)
IMPL_CMPOP(GT, s16)
IMPL_CMPOP(GT, u32)
IMPL_CMPOP(GT, s32)
IMPL_CMPOP(GT, f32)
IMPL_CMPOP(GE, u8)
IMPL_CMPOP(GE, s8)
IMPL_CMPOP(GE, u16)
IMPL_CMPOP(GE, s16)
IMPL_CMPOP(GE, u32)
IMPL_CMPOP(GE, s32)
IMPL_CMPOP(GE, f32)
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,108 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cstdlib>
#include <iostream>
#include "common.hpp"
namespace CAROTENE_NS {
bool isSupportedConfiguration()
{
#ifdef CAROTENE_NEON
return true;
#else
return false;
#endif
}
namespace internal {
void assertSupportedConfiguration(bool parametersSupported)
{
if (!isSupportedConfiguration()) {
std::cerr << "internal error: attempted to use an unavailable function" << std::endl;
std::abort();
}
if (!parametersSupported) {
std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl;
std::abort();
}
}
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
{
ptrdiff_t p = _p + (ptrdiff_t)startMargin;
size_t len = _len + startMargin + endMargin;
if( (size_t)p < len )
return _p;
else if( borderType == BORDER_MODE_REPLICATE )
p = p < 0 ? 0 : (ptrdiff_t)len - 1;
else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
{
s32 delta = borderType == BORDER_MODE_REFLECT101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta;
}
while( (size_t)p >= len );
}
else if( borderType == BORDER_MODE_WRAP )
{
if( p < 0 )
p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len;
if( p >= (ptrdiff_t)len )
p %= (ptrdiff_t)len;
}
else if( borderType == BORDER_MODE_CONSTANT )
p = -1;
else
internal::assertSupportedConfiguration(false);
return p - (ptrdiff_t)startMargin;
}
} // namespace internal
} // namespace CAROTENE_NS

View File

@ -0,0 +1,108 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_COMMON_HPP
#define CAROTENE_SRC_COMMON_HPP
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
#define CAROTENE_NEON
#endif
#ifdef CAROTENE_NEON
#include <arm_neon.h>
#include "intrinsics.hpp"
#endif
#include <carotene/functions.hpp>
#include "saturate_cast.hpp"
namespace CAROTENE_NS { namespace internal {
#ifndef CAROTENE_NEON_ARCH
# if defined(__aarch64__) || defined(__aarch32__)
# define CAROTENE_NEON_ARCH 8
# else
# define CAROTENE_NEON_ARCH 7
# endif
#endif
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
# error("ARMv7 doen't support A32/A64 Neon instructions")
#endif
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined __GNUC__
__builtin_prefetch(reinterpret_cast<const char*>(ptr) + offset);
#elif defined _MSC_VER && defined CAROTENE_NEON
__prefetch(reinterpret_cast<const char*>(ptr) + offset);
#else
(void)ptr;
(void)offset;
#endif
}
template <typename T>
inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
{
char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base));
return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride);
}
void assertSupportedConfiguration(bool parametersSupported = true);
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
/*!
* Aligns pointer by the certain number of bytes
*
* This small inline function aligns the pointer by the certain number of bytes by shifting
* it forward by 0 or a positive offset.
*/
template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
{
return (T*)(((size_t)ptr + n-1) & -n);
}
}}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,399 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void lshiftConst(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
}
for (; j < size.width; j++)
{
dst[j] = ((s16)src[j] << shift);
}
}
}
template <>
void lshiftConst<0>(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
dst[j] = (s16)src[j];
}
}
}
template <int shift>
void rshiftConst(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)((src[j] >> shift));
}
}
}
}
template <>
void rshiftConst<0>(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>(src[j]);
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)src[j];
}
}
}
}
typedef void (* lshiftConstFunc)(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride);
typedef void (* rshiftConstFunc)(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy);
} // namespace
#endif
void lshift(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
u32 shift)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (shift >= 16u)
{
for (size_t i = 0; i < size.height; ++i)
{
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(s16) * size.width);
}
return;
}
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
lshiftConstFunc funcs[16] =
{
lshiftConst<0>,
lshiftConst<1>,
lshiftConst<2>,
lshiftConst<3>,
lshiftConst<4>,
lshiftConst<5>,
lshiftConst<6>,
lshiftConst<7>,
lshiftConst<8>,
lshiftConst<9>,
lshiftConst<10>,
lshiftConst<11>,
lshiftConst<12>,
lshiftConst<13>,
lshiftConst<14>,
lshiftConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
#endif
}
void rshift(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 shift, CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (shift >= 16)
{
if (cpolicy == CONVERT_POLICY_WRAP)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_zero = vdupq_n_s16(0);
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
vmovn_u16(vcltq_s16(v_src1, v_zero)));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
}
for (; j < size.width; j++)
{
dst[j] = src[j] >= 0 ? 0 : 255;
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(u8) * size.width);
}
}
return;
}
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
rshiftConstFunc funcs[16] =
{
rshiftConst<0>,
rshiftConst<1>,
rshiftConst<2>,
rshiftConst<3>,
rshiftConst<4>,
rshiftConst<5>,
rshiftConst<6>,
rshiftConst<7>,
rshiftConst<8>,
rshiftConst<9>,
rshiftConst<10>,
rshiftConst<11>,
rshiftConst<12>,
rshiftConst<13>,
rshiftConst<14>,
rshiftConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
(void)cpolicy;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 8 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE) &&
(ksize.width == 3) && (ksize.height == 3);
}
#ifdef CAROTENE_NEON
namespace {
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
return vshrq_n_s32(value, shift);
}
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
return value;
}
} // namespace
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
void convolution(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue,
const Size2D & ksize, s16 * kernelBase, u32 scale)
{
internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
const uint8x8_t v_zero_u8 = vdup_n_u8(0);
const uint8x8_t v_border = vdup_n_u8(borderValue);
const int32x4_t v_zero_s32 = vdupq_n_s32(0);
uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
static const vshrq_s32_func vshrq_s32_a[33] =
{
vshrq_s32<0>,
vshrq_s32<1>,
vshrq_s32<2>,
vshrq_s32<3>,
vshrq_s32<4>,
vshrq_s32<5>,
vshrq_s32<6>,
vshrq_s32<7>,
vshrq_s32<8>,
vshrq_s32<9>,
vshrq_s32<10>,
vshrq_s32<11>,
vshrq_s32<12>,
vshrq_s32<13>,
vshrq_s32<14>,
vshrq_s32<15>,
vshrq_s32<16>,
vshrq_s32<17>,
vshrq_s32<18>,
vshrq_s32<19>,
vshrq_s32<20>,
vshrq_s32<21>,
vshrq_s32<22>,
vshrq_s32<23>,
vshrq_s32<24>,
vshrq_s32<25>,
vshrq_s32<26>,
vshrq_s32<27>,
vshrq_s32<28>,
vshrq_s32<29>,
vshrq_s32<30>,
vshrq_s32<31>,
vshrq_s32<32>
};
vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
u8 prevx[3] = { 0, 0, 0 },
currx[3] = { 0, 0, 0 },
nextx[3] = { 0, 0, 0 };
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
// perform vertical convolution
for ( ; x <= bwidth; x += 8)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
uint8x8_t x1 = vld1_u8(srow1 + x);
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 8 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx[0] = prevx[1] = prevx[2] = borderValue;
else
{
prevx[0] = srow0 ? srow0[x4] : borderValue;
prevx[1] = srow1[x4] ;
prevx[2] = srow2 ? srow2[x4] : borderValue;
}
currx[0] = srow0 ? srow0[x3] : borderValue;
currx[1] = srow1[x3] ;
currx[2] = srow2 ? srow2[x3] : borderValue;
}
// make shift
if (x)
{
tprev[0] = tcurr[0];
tcurr[0] = tnext[0];
tprev[1] = tcurr[1];
tcurr[1] = tnext[1];
tprev[2] = tcurr[2];
tcurr[2] = tnext[2];
}
tnext[0] = x0;
tnext[1] = x1;
tnext[2] = x2;
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr[0] = tcurr[1] = tcurr[2] = v_border;
else if (border == BORDER_MODE_REPLICATE)
{
tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
}
continue;
}
int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[0], tcurr[0], 7);
t1 = tcurr[0];
t2 = vext_u8(tcurr[0], tnext[0], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
}
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[1], tcurr[1], 7);
t1 = tcurr[1];
t2 = vext_u8(tcurr[1], tnext[1], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
}
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[2], tcurr[2], 7);
t1 = tcurr[2];
t2 = vext_u8(tcurr[2], tnext[2], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
}
// make scale
v_dst0 = vshrq_s32_p(v_dst0);
v_dst1 = vshrq_s32_p(v_dst1);
// and add them
vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
vqmovun_s32(v_dst1))));
}
x -= 8;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
{
nextx[0] = borderValue;
nextx[1] = borderValue;
nextx[2] = borderValue;
}
else if (border == BORDER_MODE_REPLICATE)
{
nextx[0] = srow0[x];
nextx[1] = srow1[x];
nextx[2] = srow2[x];
}
}
else
{
nextx[0] = srow0 ? srow0[x + 1] : borderValue;
nextx[1] = srow1[x + 1] ;
nextx[2] = srow2 ? srow2[x + 1] : borderValue;
}
s32 val = 0;
for (s32 _y = 0; _y < 3; ++_y)
val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
nextx[_y] * kernelBase[(2 - _y) * 3 + 0];
drow[x] = internal::saturate_cast<u8>(val >> scale);
// make shift
prevx[0] = currx[0];
currx[0] = nextx[0];
prevx[1] = currx[1];
currx[1] = nextx[1];
prevx[2] = currx[2];
currx[2] = nextx[2];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
(void)ksize;
(void)kernelBase;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,430 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
s32 countNonZero(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw16 = size.width & ~15u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u8* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
uint8x16_t vc1 = vmovq_n_u8(1);
for (; i < roiw16;)
{
size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
uint8x16_t vs = vmovq_n_u8(0);
for (; i <= lim; i+= 16)
{
internal::prefetch(src + i);
uint8x16_t vln = vld1q_u8(src + i);
uint8x16_t vnz = vminq_u8(vln, vc1);
vs = vaddq_u8(vs, vnz);
}
uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const u16 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw8 = size.width & ~7u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u16* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
uint16x8_t vc1 = vmovq_n_u16(1);
for (; i < roiw8;)
{
size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
uint16x8_t vs = vmovq_n_u16(0);
for (; i <= lim; i+= 8)
{
internal::prefetch(src + i);
uint16x8_t vln = vld1q_u16(src + i);
uint16x8_t vnz = vminq_u16(vln, vc1);
vs = vaddq_u16(vs, vnz);
}
uint32x4_t vs4 = vpaddlq_u16(vs);
uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const s32 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw4 = size.width & ~3u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
u32 i = 0;
uint32x4_t vc1 = vmovq_n_u32(1);
uint32x4_t vs = vmovq_n_u32(0);
for (; i < roiw4; i += 4 )
{
internal::prefetch(src + i);
uint32x4_t vln = vld1q_u32(src + i);
uint32x4_t vnz = vminq_u32(vln, vc1);
vs = vqaddq_u32(vs, vnz);
}
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const f32 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw4 = size.width & ~3u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const f32* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
float32x4_t vc0 = vmovq_n_f32(0);
int32x4_t vs = vmovq_n_s32(0);
for (; i < roiw4; i += 4 )
{
internal::prefetch(src + i);
float32x4_t vln = vld1q_f32(src + i);
int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
vs = vqaddq_s32(vs, vnz);
}
int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
int s[2];
vst1_s32(s, vs2);
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
if (result < 0)
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const f64 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw8 = size.width & ~7u;
size_t roiw4 = size.width & ~3u;
size_t roiw2 = size.width & ~1u;
uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
uint32x4_t vc0 = vmovq_n_u32(0);
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const f64* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
int32x2_t vs1 = vmov_n_s32(0);
int32x2_t vs2 = vmov_n_s32(0);
int32x2_t vs3 = vmov_n_s32(0);
int32x2_t vs4 = vmov_n_s32(0);
for (; i < roiw8; i += 8 )
{
internal::prefetch(src + i + 6);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint64x2_t vm2 = vandq_u64(vln2, vmask1);
uint64x2_t vm3 = vandq_u64(vln3, vmask1);
uint64x2_t vm4 = vandq_u64(vln4, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
uint32x4_t vlx2 = vmvnq_u32(vequ2);
uint32x4_t vlx3 = vmvnq_u32(vequ3);
uint32x4_t vlx4 = vmvnq_u32(vequ4);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
vs1 = vqadd_s32(vs1, vnz1);
vs2 = vqadd_s32(vs2, vnz2);
vs3 = vqadd_s32(vs3, vnz3);
vs4 = vqadd_s32(vs4, vnz4);
}
if (i < roiw4)
{
internal::prefetch(src + i + 2);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint64x2_t vm2 = vandq_u64(vln2, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
uint32x4_t vlx2 = vmvnq_u32(vequ2);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
vs1 = vqadd_s32(vs1, vnz1);
vs2 = vqadd_s32(vs2, vnz2);
i += 4;
}
if (i < roiw2)
{
internal::prefetch(src + i);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
vs1 = vqadd_s32(vs1, vnz1);
i += 2;
}
vs1 = vqadd_s32(vs1, vs2);
vs3 = vqadd_s32(vs3, vs4);
vs1 = vqadd_s32(vs1, vs3);
int32x2_t vsneg = vqneg_s32(vs1);
s32 s[2];
vst1_s32(s, vsneg);
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
if (result < 0)
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,681 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include "vround_helper.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
internal::vmovl(internal::vget_low(v2)), scale)),
internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
internal::vmovl(internal::vget_low(v2)), scale)),
internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
template <typename T>
void div(const Size2D &size,
const T * src0Base, ptrdiff_t src0Stride,
const T * src1Base, ptrdiff_t src1Stride,
T * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
if (scale == 0.0f ||
(std::numeric_limits<T>::is_integer &&
(scale * static_cast<float>(std::numeric_limits<T>::max())) < 1.0f &&
(scale * static_cast<float>(std::numeric_limits<T>::max())) > -1.0f))
{
for (size_t y = 0; y < size.height; ++y)
{
T * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(T) * size.width);
}
return;
}
const size_t step128 = 16 / sizeof(T);
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
const size_t step64 = 8 / sizeof(T);
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
T * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
vec128 v_src0 = internal::vld1q(src0 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src0 = internal::vld1(src0 + j);
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
vec128 v_src0 = internal::vld1q(src0 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src0 = internal::vld1(src0 + j);
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
}
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)cpolicy;
(void)scale;
#endif
}
#ifdef CAROTENE_NEON
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
#endif
template <typename T>
void recip(const Size2D &size,
const T * src1Base, ptrdiff_t src1Stride,
T * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif
if (scale == 0.0f ||
(std::numeric_limits<T>::is_integer &&
scale < 1.0f &&
scale > -1.0f))
{
for (size_t y = 0; y < size.height; ++y)
{
T * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(T) * size.width);
}
return;
}
const size_t step128 = 16 / sizeof(T);
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
const size_t step64 = 8 / sizeof(T);
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
T * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
}
}
}
#else
(void)size;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)cpolicy;
(void)scale;
#endif
}
}
void div(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
}
for (; j < size.width; j++)
{
dst[j] = src0[j] / src1[j];
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale),
internal::vrecpq_f32(v_src1)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale),
internal::vrecp_f32(v_src1)));
}
for (; j < size.width; j++)
{
dst[j] = src0[j] * scale / src1[j];
}
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
void reciprocal(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s8 * srcBase, ptrdiff_t srcStride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
vst1_f32(dst + j, internal::vrecp_f32(v_src1));
}
for (; j < size.width; j++)
{
dst[j] = 1.0f / src1[j];
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
}
for (; j < size.width; j++)
{
dst[j] = scale / src1[j];
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,260 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
f64 dotProduct(const Size2D &_size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
uint64x2_t ws = vmovq_n_u64(0);
while(i + 16 <= size.width)
{
size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
uint32x4_t s1 = vmovq_n_u32(0);
uint32x4_t s2 = vmovq_n_u32(0);
for (; i <= lim; i += 16)
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
uint8x16_t vs1 = vld1q_u8(src0 + i);
uint8x16_t vs2 = vld1q_u8(src1 + i);
uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));
s1 = vpadalq_u16(s1, vdot1);
s2 = vpadalq_u16(s2, vdot2);
}
ws = vpadalq_u32(ws, s1);
ws = vpadalq_u32(ws, s2);
}
if(i + 8 <= size.width)
{
uint8x8_t vs1 = vld1_u8(src0 + i);
uint8x8_t vs2 = vld1_u8(src1 + i);
ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
i += 8;
}
result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
for (; i < size.width; ++i)
result += s32(src0[i]) * s32(src1[i]);
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
int64x2_t ws = vmovq_n_s64(0);
while(i + 16 <= size.width)
{
size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
int32x4_t s1 = vmovq_n_s32(0);
int32x4_t s2 = vmovq_n_s32(0);
for (; i <= lim; i += 16)
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
int8x16_t vs1 = vld1q_s8(src0 + i);
int8x16_t vs2 = vld1q_s8(src1 + i);
int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));
s1 = vpadalq_s16(s1, vdot1);
s2 = vpadalq_s16(s2, vdot2);
}
ws = vpadalq_s32(ws, s1);
ws = vpadalq_s32(ws, s2);
}
if(i + 8 <= size.width)
{
int8x8_t vs1 = vld1_s8(src0 + i);
int8x8_t vs2 = vld1_s8(src1 + i);
ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
i += 8;
}
result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);
for (; i < size.width; ++i)
result += s32(src0[i]) * s32(src1[i]);
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
{
size.width *= size.height;
size.height = 1;
}
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
while(i + 4 <= size.width)
{
size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
float32x4_t v_sum = vdupq_n_f32(0.0f);
for( ; i <= lim; i += 4 )
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
}
float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
}
if(i + 2 <= size.width)
{
float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
i += 2;
}
for (; i < size.width; ++i)
result += src0[i] * src1[i];
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,2 @@
// This file is needed for compilation on some platforms e.g. with XCode generator
// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457

View File

@ -0,0 +1,428 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
pixel[0] = 0 + row_stride * 3;
pixel[1] = 1 + row_stride * 3;
pixel[2] = 2 + row_stride * 2;
pixel[3] = 3 + row_stride * 1;
pixel[4] = 3 + row_stride * 0;
pixel[5] = 3 + row_stride * -1;
pixel[6] = 2 + row_stride * -2;
pixel[7] = 1 + row_stride * -3;
pixel[8] = 0 + row_stride * -3;
pixel[9] = -1 + row_stride * -3;
pixel[10] = -2 + row_stride * -2;
pixel[11] = -3 + row_stride * -1;
pixel[12] = -3 + row_stride * 0;
pixel[13] = -3 + row_stride * 1;
pixel[14] = -2 + row_stride * 2;
pixel[15] = -1 + row_stride * 3;
}
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
const s32 K = 8, N = 16 + K + 1;
s32 k, v = ptr[0];
s16 d[(N + 7) & ~7];
for( k = 0; k < N; k++ )
d[k] = (s16)(v - ptr[pixel[k]]);
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
int16x8_t q1 = vdupq_n_s16((s16)(1000));
int16x8_t d0_7 = vld1q_s16(d + 0);
int16x8_t d8_15 = vld1q_s16(d + 8);
int16x8_t d16_23 = vld1q_s16(d + 16);
int16x8_t d24 = vld1q_s16(d + 24);
//k == 0
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 3);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 4);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 5);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 6);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 7);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
ak0 = vminq_s16(ak0, d8_15);
bk0 = vmaxq_s16(bk0, d8_15);
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
v1k0 = vextq_s16(d8_15, d16_23, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
//k == 8
int16x8_t v0k8 = v1k0;
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 3);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 4);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 5);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 6);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 7);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
ak8 = vminq_s16(ak8, d16_23);
bk8 = vmaxq_s16(bk8, d16_23);
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
v1k8 = vextq_s16(d16_23, d24, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
//fin
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
int32x4_t q2w = vmovl_s16(q2);
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
//keypoints.clear();
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
if( mask[0] == 0 )
{
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
u8 m[16];
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
const u8* tab = &threshold_tab[0] - v + 255;
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
cornerpos[-1] = ncorners;
if( i == 3 )
continue;
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,442 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
for (size_t j = j0; j < j1; ++j)
{
T val = src[j];
if (val == maxVal)
{
if (maxLocCount < maxLocCapacity)
{
maxLocPtr[maxLocCount] = j;
maxLocPtr[maxLocCount + 1] = i;
}
maxLocCount += 2;
}
if (val == minVal)
{
if (minLocCount < minLocCapacity)
{
minLocPtr[minLocCount] = j;
minLocPtr[minLocCount + 1] = i;
}
minLocCount += 2;
}
}
}
} // namespace
#endif
void fillMinMaxLocs(const Size2D & size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
vst1q_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
v_minval8 = vdupq_n_u16(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint16x8_t v_src = vld1q_u16(src + j);
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
v_minval8 = vdupq_n_s16(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int16x8_t v_src = vld1q_s16(src + j);
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
v_minval4 = vdupq_n_s32(minVal);
u64 mask = 0ul;
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const u32 * srcBase, ptrdiff_t srcStride,
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
v_minval4 = vdupq_n_u32(minVal);
u64 mask = 0ul;
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,222 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4);
return isSupportedConfiguration() &&
((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) ||
(flipMode == FLIP_VERTICAL_MODE));
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef __ANDROID__
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef __ANDROID__
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef __ANDROID__
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // __ANDROID__
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
void flip(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode, u32 elemSize)
{
internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
if (flipMode == FLIP_VERTICAL_MODE)
{
for (size_t y = 0; y < size.height; ++y)
{
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
std::memcpy(dst_row, src_row, elemSize * size.width);
}
return;
}
flipFunc func = NULL;
if (elemSize == (u32)sizeof(u8))
func = &flip<u8>;
if (elemSize == (u32)sizeof(u16))
func = &flip<u16>;
if (elemSize == (u32)sizeof(u32))
func = &flip<u32>;
if (elemSize == (u32)sizeof(u8) * 3)
func = &flip3<u8>;
if (func == NULL)
return;
func(size,
srcBase, srcStride,
dstBase, dstStride,
flipMode);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)flipMode;
(void)elemSize;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,195 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
template <typename T, int elsize> struct vtail
{
static inline void inRange(const T *, const T *, const T *,
u8 *, size_t &, size_t)
{
//do nothing since there couldn't be enough data
}
};
template <typename T> struct vtail<T, 2>
{
static inline void inRange(const T * src, const T * rng1, const T * rng2,
u8 * dst, size_t &x, size_t width)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if( x + 8 < width)
{
vec128 vs = internal::vld1q( src + x);
vec128 vr1 = internal::vld1q(rng1 + x);
vec128 vr2 = internal::vld1q(rng2 + x);
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
internal::vst1(dst + x, internal::vmovn(vd));
x+=8;
}
}
};
template <typename T> struct vtail<T, 1>
{
static inline void inRange(const T * src, const T * rng1, const T * rng2,
u8 * dst, size_t &x, size_t width)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if( x + 16 < width)
{
vec128 vs = internal::vld1q( src + x);
vec128 vr1 = internal::vld1q(rng1 + x);
vec128 vr2 = internal::vld1q(rng2 + x);
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
internal::vst1q(dst + x, vd);
x+=16;
}
if( x + 8 < width)
{
vec64 vs = internal::vld1( src + x);
vec64 vr1 = internal::vld1(rng1 + x);
vec64 vr2 = internal::vld1(rng2 + x);
uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
internal::vst1(dst + x, vd);
x+=8;
}
}
};
template <typename T>
inline void inRangeCheck(const Size2D &_size,
const T * srcBase, ptrdiff_t srcStride,
const T * rng1Base, ptrdiff_t rng1Stride,
const T * rng2Base, ptrdiff_t rng2Stride,
u8 * dstBase, ptrdiff_t dstStride)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
Size2D size(_size);
if (srcStride == dstStride &&
srcStride == rng1Stride &&
srcStride == rng2Stride &&
srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const size_t width = size.width & ~( 32/sizeof(T) - 1 );
for(size_t j = 0; j < size.height; ++j)
{
const T * src = internal::getRowPtr( srcBase, srcStride, j);
const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
size_t i = 0;
for( ; i < width; i += 32/sizeof(T) )
{
internal::prefetch(src + i);
internal::prefetch(rng1 + i);
internal::prefetch(rng2 + i);
vec128 vs = internal::vld1q( src + i);
vec128 vr1 = internal::vld1q(rng1 + i);
vec128 vr2 = internal::vld1q(rng2 + i);
uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
vs = internal::vld1q( src + i + 16/sizeof(T));
vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
vnst(dst + i, vd1, vd2);
}
vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
for( ; i < size.width; i++ )
dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
}
}
}
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
const T * srcBase, ptrdiff_t srcStride, \
const T * rng1Base, ptrdiff_t rng1Stride, \
const T * rng2Base, ptrdiff_t rng2Stride, \
u8 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
inRangeCheck(_size, srcBase, srcStride, \
rng1Base, rng1Stride, rng2Base, rng2Stride, \
dstBase, dstStride); \
}
#else
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
u8 *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,238 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
void integral(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u32 * sumBase, ptrdiff_t sumStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
uint32x4_t v_zero = vmovq_n_u32(0u);
// the first iteration
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
uint32x4_t prev = v_zero;
size_t j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sum + j);
internal::prefetch(src + j);
uint8x8_t el8shr0 = vld1_u8(src + j);
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
uint32x4_t vsumh = vaddw_u16(prev, el4h);
vst1q_u32(sum + j, vsuml);
vst1q_u32(sum + j + 4, vsumh);
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
}
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
sum[j] = (v += src[j]);
// the others
for (size_t i = 1; i < size.height ; ++i)
{
src = internal::getRowPtr(srcBase, srcStride, i);
u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
sum = internal::getRowPtr(sumBase, sumStride, i);
prev = v_zero;
j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sum + j);
internal::prefetch(src + j);
uint32x4_t vsuml = vld1q_u32(prevSum + j);
uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
uint8x8_t el8shr0 = vld1_u8(src + j);
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
vsuml = vaddq_u32(vsuml, prev);
vsumh = vaddq_u32(vsumh, prev);
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
vsumh = vaddw_u16(vsumh, el4h);
vst1q_u32(sum + j, vsuml);
vst1q_u32(sum + j + 4, vsumh);
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
}
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
sum[j] = (v += src[j]) + prevSum[j];
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)sumBase;
(void)sumStride;
#endif
}
void sqrIntegral(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
f64 * sqsumBase, ptrdiff_t sqsumStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
uint16x8_t v_zero8 = vmovq_n_u16(0u);
// the first iteration
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
double prev = 0.;
size_t j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sqsum + j);
internal::prefetch(src + j);
uint8x8_t vsrc = vld1_u8(src + j);
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
u32 buf[8];
vst1_u32(buf, vget_low_u32(el8shr01l));
vst1_u32(buf+2, el2l);
vst1_u32(buf+4, el2hl);
vst1_u32(buf+6, el2hh);
for(u32 k=0; k < 8; k++)
sqsum[j+k] = prev + buf[k];
prev += buf[7];
}
for (; j < size.width; ++j)
sqsum[j] = (prev += src[j]*src[j]);
// the others
for (size_t i = 1; i < size.height ; ++i)
{
src = internal::getRowPtr(srcBase, srcStride, i);
f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
prev = 0.;
j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sqsum + j);
internal::prefetch(src + j);
uint8x8_t vsrc = vld1_u8(src + j);
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
u32 buf[8];
vst1_u32(buf, vget_low_u32(el8shr01l));
vst1_u32(buf+2, el2l);
vst1_u32(buf+4, el2hl);
vst1_u32(buf+6, el2hh);
for(u32 k=0; k < 8; k++)
sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
prev += buf[7];
}
for (; j < size.width; ++j)
sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)sqsumBase;
(void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,112 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
float32x4_t reciprocal = vrecpeq_f32(val);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
return reciprocal;
}
inline float32x2_t vrecp_f32(float32x2_t val)
{
float32x2_t reciprocal = vrecpe_f32(val);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
return reciprocal;
}
// caclulate sqrt value
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
float32x4_t e = vrsqrteq_f32(val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
return e;
}
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
float32x2_t e = vrsqrte_f32(val);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
return e;
}
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
return vrecpq_f32(vrsqrtq_f32(val));
}
inline float32x2_t vsqrt_f32(float32x2_t val)
{
return vrecp_f32(vrsqrt_f32(val));
}
// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
// AArch64 supports this natively
return ::vqtbl1_u8(a, b);
#else
union { uint8x16_t v; uint8x8x2_t w; } u = { a };
return vtbl2_u8(u.w, b);
#endif
}
} }
#endif

View File

@ -0,0 +1,713 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 8 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE);
}
void Laplacian3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
const uint16x8_t v_zero = vdupq_n_u16(0);
const uint8x8_t v_border = vdup_n_u8(borderValue);
uint8x8_t vsub;
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
s16 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
// perform vertical convolution
for ( ; x <= bwidth; x += 8)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
uint8x8_t x1 = vld1_u8(srow1 + x);
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 8 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border_x3;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
vsub = x1;
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u16(tprev, tcurr, 7);
t1 = tcurr;
t2 = vextq_u16(tcurr, tnext, 1);
// and add them
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
uint8x8_t it0 = vqmovun_s16(tt0);
vst1_u8(drow + x - 8, it0);
vsub = x1;
}
x -= 8;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue * 3;
else if (border == BORDER_MODE_REPLICATE)
nextx = srow2[x] + srow1[x] + srow0[x];
}
else
{
nextx = (srow2 ? srow2[x + 1] : borderValue) +
srow1[x + 1] +
(srow0 ? srow0[x + 1] : borderValue);
}
s32 val = (prevx + currx + nextx) - 9 * srow1[x];
drow[x] = internal::saturate_cast<u8>((s32)val);
// make shift
prevx = currx;
currx = nextx;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() &&
size.width >= 8 && size.height >= 1 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE);
}
void Laplacian1OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v2 = 0;
// make border
if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tcurr = vmovq_n_s16(0x0);
int16x8_t tnext = vmovq_n_s16(0x0);
int16x8_t t0, t2;
uint8x8_t xx0 = vmov_n_u8(0x0);
uint8x8_t xx1 = vmov_n_u8(0x0);
uint8x8_t xx2 = vmov_n_u8(0x0);
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
if(x) {
xx0 = xx1;
xx1 = xx2;
} else {
xx1 = x1;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
}
else if (border == BORDER_MODE_CONSTANT)
{
xx1 = vset_lane_u8(borderValue, x1, 7);
}
else if (border == BORDER_MODE_REFLECT101)
{
xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
}
}
xx2 = x1;
if(x) {
tcurr = tnext;
}
tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
if(!x) {
tcurr = tnext;
continue;
}
t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x == cols){
x--;
}
for( ; x < cols; x++ )
{
s16 nextx;
s16 prevx;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
prevx = x == 0 ? v1[0] : v1[x-1];
nextx = x == cols-1 ? v1[x] : v1[x+1];
}
else if (border == BORDER_MODE_REFLECT101)
{
prevx = x == 0 ? v1[1] : v1[x-1];
nextx = x == cols-1 ? v1[x-1] : v1[x+1];
}
else //if (border == BORDER_MODE_CONSTANT)
{
prevx = x == 0 ? borderValue : v1[x-1];
nextx = x == cols-1 ? borderValue : v1[x+1];
}
*(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void Laplacian3OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v2 = 0;
// make border
if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tprev = vmovq_n_s16(0x0);
int16x8_t tcurr = vmovq_n_s16(0x0);
int16x8_t tnext = vmovq_n_s16(0x0);
int16x8_t tc = vmovq_n_s16(0x0);
int16x8_t t0, t2, tcnext;
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
if(x) {
tprev = tcurr;
tcurr = tnext;
}
tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
if(!x) {
tcurr = tnext;
tc = tcnext;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
}
else if (border == BORDER_MODE_CONSTANT)
{
tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
}
else if (border == BORDER_MODE_REFLECT101)
{
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
}
continue;
}
t0 = vextq_s16(tprev, tcurr, 7);
t2 = vextq_s16(tcurr, tnext, 1);
t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
tc = tcnext;
t0 = vshlq_n_s16(t0, 1);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x == cols){
x--;
}
for( ; x < cols; x++ )
{
s16 nextx, nextx2;
s16 prevx, prevx2;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
prevx = x == 0 ? v0[0] : v0[x-1];
prevx2 = x == 0 ? v2[0] : v2[x-1];
nextx = x == cols-1 ? v0[x] : v0[x+1];
nextx2 = x == cols-1 ? v2[x] : v2[x+1];
}
else if (border == BORDER_MODE_REFLECT101)
{
prevx = x == 0 ? v0[1] : v0[x-1];
prevx2 = x == 0 ? v2[1] : v2[x-1];
nextx = x == cols-1 ? v0[x-1] : v0[x+1];
nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
}
else //if (border == BORDER_MODE_CONSTANT)
{
prevx = x == 0 ? borderValue : v0[x-1];
prevx2 = x == 0 ? borderValue : v2[x-1];
nextx = x == cols-1 ? borderValue : v0[x+1];
nextx2 = x == cols-1 ? borderValue : v2[x+1];
}
s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void Laplacian5OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = 0;
const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v3 = 0;
const u8* v4 = 0;
// make border
if (border == BORDER_MODE_REPLICATE) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
} else if (border == BORDER_MODE_REFLECT) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
} else if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tnext, tc, t0;
int16x8_t tnext2, tnext3;
int16x8_t tnext1Old, tnext2Old, tnext3Old;
int16x8_t tnext4OldOldOld, tnext5OldOldOld;
int16x8_t tcurr1 = vmovq_n_s16(0x0);
int16x8_t tnext1 = vmovq_n_s16(0x0);
int16x8_t tprev1 = vmovq_n_s16(0x0);
int16x8_t tpprev1 = vmovq_n_s16(0x0);
int16x8_t tppprev1 = vmovq_n_s16(0x0);
int16x8_t tnext4Old = vmovq_n_s16(0x0);
int16x8_t tnext5Old = vmovq_n_s16(0x0);
int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
// do vertical convolution
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
internal::prefetch(v3 + x);
internal::prefetch(v4 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
uint8x8_t x3 = vld1_u8(v3 + x);
uint8x8_t x4 = vld1_u8(v4 + x);
if(x) {
tcurr1 = tnext1;
}
tnext4OldOldOld = tnext4Old;
tnext5OldOldOld = tnext5Old;
tnext1Old = tnext1OldOld;
tnext2Old = tnext2OldOld;
tnext3Old = tnext3OldOld;
tnext4Old = tnext4OldOld;
tnext5Old = tnext5OldOld;
tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
tnext3 = vshlq_n_s16(tnext3, 1);
tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
tnext2 = vsubq_s16(tc, tnext);
tnext1 = vaddq_s16(tnext3, tnext2);
// tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
tnext2 = vshlq_n_s16(tnext2, 1);
// tnext2 = 2*x4 - 4*x2 + 2*x0
tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
// tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4
tnext1OldOld = tnext1;
tnext2OldOld = tnext2;
tnext3OldOld = tnext3;
tnext4OldOld = tnext2;
tnext5OldOld = tnext1;
if(x) {
tnext1 = vextq_s16(tnext1Old, tnext1, 2);
tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
tprev1 = tnext3Old;
if(x!=8) {
tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
}
}
if(!x) {
// make border
if (border == BORDER_MODE_REPLICATE) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT101) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
} else if (border == BORDER_MODE_CONSTANT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
}
tppprev1 = tprev1;
continue;
}
t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
t0 = vaddq_s16(t0, t0);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x >= cols - 1)
x = cols-2;
s16 pprevx = 0;
s16 prevx = 0;
s16 nextx = 0;
s16 nnextx = 0;
for( ; x < cols; x++ )
{
if (x == 0) {
// make border
if (border == BORDER_MODE_REPLICATE) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
prevx = 0;
}
} else if (x == 1) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
}
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else {
pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
}
s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
if (x == cols-1) {
// make border
if (border == BORDER_MODE_REPLICATE) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_REFLECT) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
} else if (border == BORDER_MODE_REFLECT101) {
nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
} else if (border == BORDER_MODE_CONSTANT) {
nextx = 0;
nnextx = 8 * borderValue;
}
} else if (x == cols-2) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
} else if (border == BORDER_MODE_REFLECT101) {
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_CONSTANT) {
nnextx = 8 * borderValue;
}
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
} else {
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
}
s16 res = pprevx + prevx + currx + nextx + nnextx;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,160 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
struct Magnitude
{
typedef s16 type;
void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
int16x8_t & v_dst) const
{
int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
v_src0_p = vget_high_s16(v_src0);
v_src1_p = vget_high_s16(v_src1);
float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
}
void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
int16x4_t & v_dst) const
{
float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
v_dst = vqmovn_s32(v_sqrt);
}
void operator() (const short * src0, const short * src1, short * dst) const
{
f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
}
};
struct MagnitudeF32
{
typedef f32 type;
void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
float32x4_t & v_dst) const
{
v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
}
void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
float32x2_t & v_dst) const
{
v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
}
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
}
};
} // namespace
#endif
void magnitude(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
Magnitude());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void magnitude(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
MagnitudeF32());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,163 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
void meanStdDev(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
f64 fsum = 0.0f, fsqsum = 0.0f;
sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1);
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
void meanStdDev(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
f64 fsum = 0.0f, fsqsum = 0.0f;
f32 arsum[8];
uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw4)
{
size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
v_sum = v_zero;
v_sqsum = v_zero_f;
for ( ; j + 16 < blockSize ; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// 0
uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
// 1
v_srclo = vmovl_u16(vget_low_u16(v_src1));
v_srchi = vmovl_u16(vget_high_u16(v_src1));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
v_srclo_f = vcvtq_f32_u32(v_srclo);
v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
}
for ( ; j < blockSize; j += 4)
{
uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
float32x4_t v_src_f = vcvtq_f32_u32(v_src);
v_sum = vaddq_u32(v_sum, v_src);
v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
}
vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
vst1q_f32(arsum + 4, v_sqsum);
fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
}
// collect a few last elements in the current row
for ( ; j < size.width; ++j)
{
f32 srcval = src[j];
fsum += srcval;
fsqsum += srcval * srcval;
}
}
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,227 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
u8 buf[16+8];
vst1q_u8(buf+cn, r);
for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i];
return vld1q_u8(buf);
}
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
u8 buf[8+8];
vst1_u8(buf, r);
for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i];
return vld1_u8(buf+cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
#define ELT(num, level) v ## num ## _lv ## level
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8;
}
void medianFilter3x3(const Size2D &size, u32 numChannels,
const u8 *srcBase, ptrdiff_t srcStride,
const Margin &srcMargin,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
u32 cn = numChannels;
size_t colsn = size.width * cn;
for (size_t i = 0; i < size.height; ++i) {
const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
{
uint8x16_t v3_lv00 = vld1q_u8(psrc0);
uint8x16_t v4_lv00 = vld1q_u8(psrc1);
uint8x16_t v5_lv00 = vld1q_u8(psrc2);
uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
goto medianBlur3x3_mainBody;
for (; j < colsn - 16; j += 16) {
internal::prefetch(psrc0 + j);
internal::prefetch(psrc1 + j);
internal::prefetch(psrc2 + j);
v0_lv00 = vld1q_u8(psrc0 + j - cn);
v1_lv00 = vld1q_u8(psrc1 + j - cn);
v2_lv00 = vld1q_u8(psrc2 + j - cn);
v3_lv00 = vld1q_u8(psrc0 + j);
v4_lv00 = vld1q_u8(psrc1 + j);
v5_lv00 = vld1q_u8(psrc2 + j);
v6_lv00 = vld1q_u8(psrc0 + j + cn);
v7_lv00 = vld1q_u8(psrc1 + j + cn);
v8_lv00 = vld1q_u8(psrc2 + j + cn);
medianBlur3x3_mainBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1q_u8(pdst + j, v4_lv19);
}
}
{
size_t k = colsn - 8;
uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
goto medianBlur3x3_tailBody;
for (; k >= j - 8; k -= 8) {
v0_lv00 = vld1_u8(psrc0 + k - cn);
v1_lv00 = vld1_u8(psrc1 + k - cn);
v2_lv00 = vld1_u8(psrc2 + k - cn);
v3_lv00 = vld1_u8(psrc0 + k);
v4_lv00 = vld1_u8(psrc1 + k);
v5_lv00 = vld1_u8(psrc2 + k);
v6_lv00 = vld1_u8(psrc0 + k + cn);
v7_lv00 = vld1_u8(psrc1 + k + cn);
v8_lv00 = vld1_u8(psrc2 + k + cn);
medianBlur3x3_tailBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1_u8(pdst + k, v4_lv19);
}
}
}
#else
(void)size;
(void)numChannels;
(void)srcBase;
(void)srcStride;
(void)srcMargin;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,139 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct Min
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
template <typename T>
struct Max
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,728 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 16 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE);
}
#ifdef CAROTENE_NEON
namespace {
struct ErodeVecOp
{
ErodeVecOp():borderValue(0){}
ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
borderValue(borderValue_)
{
if (border == BORDER_MODE_REPLICATE)
borderValue = std::numeric_limits<u8>::max();
}
inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
{
return vminq_u8(a, b);
}
inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
{
return vmin_u8(a, b);
}
inline u8 operator()(u8 a, u8 b) const
{
return std::min(a, b);
}
u8 borderValue;
};
struct DilateVecOp
{
DilateVecOp():borderValue(0){}
DilateVecOp(BORDER_MODE border, u8 borderValue_) :
borderValue(borderValue_)
{
if (border == BORDER_MODE_REPLICATE)
borderValue = std::numeric_limits<u8>::min();
}
inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
{
return vmaxq_u8(a, b);
}
inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
{
return vmax_u8(a, b);
}
inline u8 operator()(u8 a, u8 b) const
{
return std::max(a, b);
}
u8 borderValue;
};
template <typename VecOp>
void morph3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, const VecOp & vop)
{
u8 borderValue = vop.borderValue;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
const uint8x16_t v_zero = vdupq_n_u8(0);
const uint8x16_t v_border = vdupq_n_u8(borderValue);
uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
u8 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
// perform vertical convolution
for ( ; x <= bwidth; x += 16)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
uint8x16_t x1 = vld1q_u8(srow1 + x);
uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 16 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = vop(srow1[x4],
vop(srow2 ? srow2[x4] : borderValue,
srow0 ? srow0[x4] : borderValue));
currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vop(vop(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u8(tprev, tcurr, 15);
t1 = tcurr;
t2 = vextq_u8(tcurr, tnext, 1);
// and add them
t0 = vop(t0, vop(t1, t2));
vst1q_u8(drow + x - 16, t0);
}
x -= 16;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue;
else if (border == BORDER_MODE_REPLICATE)
nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
}
else
nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
srow0 ? srow0[x + 1] : borderValue),
srow1[x + 1]);
drow[x] = vop(prevx, vop(currx, nextx));
// make shift
prevx = currx;
currx = nextx;
}
}
}
} // namespace
#endif
void erode3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
morph3x3(size,
srcBase, srcStride,
dstBase, dstStride,
border, ErodeVecOp(border, borderValue));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void dilate3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
morph3x3(size,
srcBase, srcStride,
dstBase, dstStride,
border, DilateVecOp(border, borderValue));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
size_t i, j, k;
size_t width16 = (width & -16) * cn;
size_t width8 = (width & -8) * cn;
width *= cn;
if (ksize == 1)
{
for (i = 0; i < width; i++)
dst[i] = src[i];
return;
}
ksize = ksize*cn;
VecUpdate updateOp;
switch(cn)
{
case 1:
for (i = 0; i < width16; i += 16)
{
const u8* sptr = src + i;
uint8x16_t s = vld1q_u8(sptr);
internal::prefetch(sptr);
for( k = 1; k < ksize; ++k)
s = updateOp(s, vld1q_u8(sptr + k));
vst1q_u8(dst + i, s);
}
for (; i < width8; i += 8)
{
const u8* sptr = src + i;
uint8x8_t s = vld1_u8(sptr);
internal::prefetch(sptr);
for( k = 1; k < ksize; ++k)
s = updateOp(s, vld1_u8(sptr + k));
vst1_u8(dst + i, s);
}
break;
default:
for (i = 0; i < width16; i += 16)
{
uint8x16_t s = vld1q_u8(src + i);
internal::prefetch(src + i);
for (k = cn; k < ksize; k += cn)
s = updateOp(s, vld1q_u8(src + i + k));
vst1q_u8(dst + i, s);
}
for (; i < width8; i += 8)
{
uint8x8_t s = vld1_u8(src + i);
internal::prefetch(src + i);
for (k = cn; k < ksize; k += cn)
s = updateOp(s, vld1_u8(src + i + k));
vst1_u8(dst + i, s);
}
break;
}
ptrdiff_t i0 = i;
for( k = 0; k < (size_t)cn; k++, src++, dst++ )
{
for( i = i0; i <= width - cn*2; i += cn*2 )
{
const u8* s = src + i;
u8 m = s[cn];
for( j = cn*2; j < ksize; j += cn )
m = updateOp(m, s[j]);
dst[i] = updateOp(m, s[0]);
dst[i+cn] = updateOp(m, s[j]);
}
for( ; i < width; i += cn )
{
const u8* s = src + i;
u8 m = s[0];
for( j = cn; j < ksize; j += cn )
m = updateOp(m, s[j]);
dst[i] = m;
}
}
}
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
size_t i, k;
size_t width32 = width & -32;
VecUpdate updateOp;
uint8x16_t x0,x1,s0,s1;
if (ksize == 3)
{
for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
sptr = src[2] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
sptr = src[3] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
}
else if (ksize > 1)
for (; count > 1; count -= 2, dst += dststep*2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 2; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
for (; count > 0; count--, dst += dststep, src++)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[0] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 1; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
vst1q_u8(dst + i, s0);
vst1q_u8(dst + i + 16, s1);
}
for(; i < width; i++ )
{
u8 s = src[0][i];
for( k = 1; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = s;
}
}
}
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
//Temporary buffers common for all iterations
std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
u8* srcRow = &_srcRow[0];
size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
std::vector<u8*> _rows(bufRows);
u8** rows = &_rows[0];
// adjust swidthcn so that the used part of buffers stays compact in memory
ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
std::vector<u8> _ringBuf(swidthcn*bufRows+16);
u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
std::vector<ptrdiff_t> _borderTab(borderLength);
ptrdiff_t * borderTab = &_borderTab[0];
std::vector<u8> _constBorderValue;
std::vector<u8> _constBorderRow;
u8 * constBorderValue = NULL;
u8 * constBorderRow = NULL;
if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
{
_constBorderValue.resize(borderLength);
constBorderValue = &_constBorderValue[0];
size_t i;
for(i = 0; i < cn; i++)
constBorderValue[i] = borderValues[i];
for(; i < borderLength; i++)
constBorderValue[i] = constBorderValue[i-cn];
if( columnBorderType == BORDER_MODE_CONSTANT )
{
_constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
size_t N = (ssize.width + ksize.width - 1)*cn;
for( i = 0; i < N; i += borderLength )
{
size_t n = std::min( borderLength, N - i );
for(size_t j = 0; j < n; j++)
srcRow[i+j] = constBorderValue[j];
}
MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
}
}
Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
ssize.height + borderMargin.top + borderMargin.bottom);
ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
// recompute border tables
if( dx1 > 0 || dx2 > 0 )
{
if( rowBorderType == BORDER_MODE_CONSTANT )
{
memcpy( srcRow, &constBorderValue[0], dx1*cn );
memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
}
else
{
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
ptrdiff_t wholeWidth = wholeSize.width;
ptrdiff_t i, j;
for( i = 0; i < dx1; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[i*cn + j] = p0 + j;
}
for( i = 0; i < dx2; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[(i + dx1)*cn + j] = p0 + j;
}
}
}
ptrdiff_t startY, startY0, endY, rowCount;
startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
u8* dst = dstBase;
ptrdiff_t width = ssize.width, kwidth = ksize.width;
ptrdiff_t kheight = ksize.height, ay = anchorY;
ptrdiff_t width1 = ssize.width + kwidth - 1;
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
ptrdiff_t dy = 0, i = 0;
src -= xofs1*cn;
ptrdiff_t count = endY - startY;
rowCount = 0;
for(;; dst += dstStride*i, dy += i)
{
ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
dcount = std::min(dcount, count);
count -= dcount;
for( ; dcount-- > 0; src += srcStride )
{
ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
u8* brow = ringBuf + bi*swidthcn;
if( (size_t)(++rowCount) > bufRows )
{
--rowCount;
++startY;
}
memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
if( makeBorder )
{
for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
srcRow[i] = src[borderTab[i]];
for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
}
MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
}
ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
for( i = 0; i < max_i; i++ )
{
ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
wholeSize.height, columnBorderType);
if( srcY < 0 ) // can happen only with constant border type
rows[i] = constBorderRow;
else
{
if( srcY >= startY + rowCount )
break;
ptrdiff_t bi = (srcY - startY0) % bufRows;
rows[i] = ringBuf + bi*swidthcn;
}
}
if( i < kheight )
break;
i -= kheight - 1;
MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
}
}
} // namespace
#endif // CAROTENE_NEON
void erode(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
morphology<ErodeVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
ksize, anchorX, anchorY, rowBorderType, columnBorderType,
borderValues, borderMargin);
#else
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)rowBorderType;
(void)columnBorderType;
(void)borderValues;
(void)borderMargin;
#endif
}
void dilate(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
morphology<DilateVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
ksize, anchorX, anchorY, rowBorderType, columnBorderType,
borderValues, borderMargin);
#else
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)rowBorderType;
(void)columnBorderType;
(void)borderValues;
(void)borderMargin;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,539 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <float.h> // For FLT_EPSILON
namespace CAROTENE_NS {
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
/*
* Pyramidal Lucas-Kanade Optical Flow level processing
*/
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
const u8 *prevData, ptrdiff_t prevStride,
const s16 *prevDerivData, ptrdiff_t prevDerivStride,
const u8 *nextData, ptrdiff_t nextStride,
u32 ptCount,
const f32 *prevPts, f32 *nextPts,
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
f32 minEigThreshold)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
s32 cn2 = cn*2;
std::vector<s16> _buf(winSize.total()*(cn + cn2));
s16* IWinBuf = &_buf[0];
s32 IWinBufStride = winSize.width*cn;
s16* derivIWinBuf = &_buf[winSize.total()*cn];
s32 derivIWinBufStride = winSize.width*cn2;
for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
{
f32 levscale = (1./(1 << level));
u32 ptref = ptidx << 1;
f32 prevPtX = prevPts[ptref+0]*levscale;
f32 prevPtY = prevPts[ptref+1]*levscale;
f32 nextPtX;
f32 nextPtY;
if( level == maxLevel )
{
if( useInitialFlow )
{
nextPtX = nextPts[ptref+0]*levscale;
nextPtY = nextPts[ptref+1]*levscale;
}
else
{
nextPtX = prevPtX;
nextPtY = prevPtY;
}
}
else
{
nextPtX = nextPts[ptref+0]*2.f;
nextPtY = nextPts[ptref+1]*2.f;
}
nextPts[ptref+0] = nextPtX;
nextPts[ptref+1] = nextPtY;
s32 iprevPtX, iprevPtY;
s32 inextPtX, inextPtY;
prevPtX -= halfWinX;
prevPtY -= halfWinY;
iprevPtX = floor(prevPtX);
iprevPtY = floor(prevPtY);
if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
continue;
}
f32 a = prevPtX - iprevPtX;
f32 b = prevPtY - iprevPtY;
const s32 W_BITS = 14, W_BITS1 = 14;
const f32 FLT_SCALE = 1.f/(1 << 20);
s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
s32 dstep = prevDerivStride/sizeof(s16);
f32 A11 = 0, A12 = 0, A22 = 0;
int16x4_t viw00 = vmov_n_s16((s16)iw00);
int16x4_t viw01 = vmov_n_s16((s16)iw01);
int16x4_t viw10 = vmov_n_s16((s16)iw10);
int16x4_t viw11 = vmov_n_s16((s16)iw11);
float32x4_t vA11 = vmovq_n_f32(0);
float32x4_t vA12 = vmovq_n_f32(0);
float32x4_t vA22 = vmovq_n_f32(0);
s32 wwcn = winSize.width*cn;
// extract the patch from the first image, compute covariation matrix of derivatives
s32 x = 0;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;
s16* Iptr = IWinBuf + y*IWinBufStride;
s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
internal::prefetch(src + x + prevStride * 2, 0);
for(x = 0; x <= wwcn - 8; x += 8)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
}
for(; x <= wwcn - 4; x += 4)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));
int32x4_t vsuml1 = vmull_s16(vs00, viw00);
int32x4_t vsuml2 = vmull_s16(vs01, viw01);
vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
vst1_s16(Iptr + x, vsumnl);
}
internal::prefetch(dsrc + dstep * 2, 0);
for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
{
#if 0
__asm__ (
"vld2.16 {d0-d1}, [%[dsrc00]] \n\t"
"vld2.16 {d2-d3}, [%[dsrc10]] \n\t"
"vld2.16 {d4-d5}, [%[dsrc01]] \n\t"
"vld2.16 {d6-d7}, [%[dsrc11]] \n\t"
"vmull.s16 q4, d3, %P[viw10] \n\t"
"vmull.s16 q5, d0, %P[viw00] \n\t"
"vmlal.s16 q4, d7, %P[viw11] \n\t"
"vmlal.s16 q5, d4, %P[viw01] \n\t"
"vmlal.s16 q4, d1, %P[viw00] \n\t"
"vmlal.s16 q5, d2, %P[viw10] \n\t"
"vmlal.s16 q4, d5, %P[viw01] \n\t"
"vmlal.s16 q5, d6, %P[viw11] \n\t"
"vrshrn.s32 d13, q4, %[W_BITS1] \n\t"
"vrshrn.s32 d12, q5, %[W_BITS1] \n\t"
"vmull.s16 q3, d13, d13 \n\t"
"vmull.s16 q4, d12, d12 \n\t"
"vmull.s16 q5, d13, d12 \n\t"
"vcvt.f32.s32 q3, q3 \n\t"
"vcvt.f32.s32 q4, q4 \n\t"
"vcvt.f32.s32 q5, q5 \n\t"
"vadd.f32 %q[vA22], q3 \n\t"
"vadd.f32 %q[vA11], q4 \n\t"
"vadd.f32 %q[vA12], q5 \n\t"
"vst2.16 {d12-d13}, [%[out]] \n\t"
: [vA22] "=w" (vA22),
[vA11] "=w" (vA11),
[vA12] "=w" (vA12)
: "0" (vA22),
"1" (vA11),
"2" (vA12),
[out] "r" (dIptr),
[dsrc00] "r" (dsrc),
[dsrc10] "r" (dsrc + dstep),
[dsrc01] "r" (dsrc + cn2),
[dsrc11] "r" (dsrc + dstep + cn2),
[viw00] "w" (viw00),
[viw10] "w" (viw10),
[viw01] "w" (viw01),
[viw11] "w" (viw11),
[W_BITS1] "I" (W_BITS1)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
);
#else
int16x4x2_t vdsrc00 = vld2_s16(dsrc);
int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);
int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);
vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);
vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);
vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);
int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);
int32x4_t va22i = vmull_s16(vsumny, vsumny);
int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
int32x4_t va12i = vmull_s16(vsumnx, vsumny);
float32x4_t va22f = vcvtq_f32_s32(va22i);
float32x4_t va11f = vcvtq_f32_s32(va11i);
float32x4_t va12f = vcvtq_f32_s32(va12i);
vA22 = vaddq_f32(vA22, va22f);
vA11 = vaddq_f32(vA11, va11f);
vA12 = vaddq_f32(vA12, va12f);
int16x4x2_t vsum;
vsum.val[0] = vsumnx;
vsum.val[1] = vsumny;
vst2_s16(dIptr, vsum);
#endif
}
for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
{
s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);
Iptr[x] = (s16)ival;
dIptr[0] = (s16)ixval;
dIptr[1] = (s16)iyval;
A11 += (f32)(ixval*ixval);
A12 += (f32)(ixval*iyval);
A22 += (f32)(iyval*iyval);
}
}
f32 A11buf[2], A12buf[2], A22buf[2];
vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
A11 += A11buf[0] + A11buf[1];
A12 += A12buf[0] + A12buf[1];
A22 += A22buf[0] + A22buf[1];
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
f32 D = A11*A22 - A12*A12;
f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
4.f*A12*A12))/(2*winSize.width*winSize.height);
if( err && getMinEigenVals )
err[ptidx] = (f32)minEig;
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
status[ptidx] = false;
continue;
}
D = 1.f/D;
nextPtX -= halfWinX;
nextPtY -= halfWinY;
f32 prevDeltaX = 0;
f32 prevDeltaY = 0;
for(u32 j = 0; j < terminationCount; j++ )
{
inextPtX = floor(nextPtX);
inextPtY = floor(nextPtY);
if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
{
if( level == 0 && status )
status[ptidx] = false;
break;
}
a = nextPtX - inextPtX;
b = nextPtY - inextPtY;
iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = round(a*(1.f - b)*(1 << W_BITS));
iw10 = round((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 b1 = 0, b2 = 0;
viw00 = vmov_n_s16((s16)iw00);
viw01 = vmov_n_s16((s16)iw01);
viw10 = vmov_n_s16((s16)iw10);
viw11 = vmov_n_s16((s16)iw11);
float32x4_t vb1 = vmovq_n_f32(0);
float32x4_t vb2 = vmovq_n_f32(0);
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
x = 0;
internal::prefetch(Jptr, nextStride * 2);
internal::prefetch(Iptr, IWinBufStride/2);
internal::prefetch(dIptr, derivIWinBufStride/2);
for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
{
uint8x8_t vj00 = vld1_u8(Jptr + x);
uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
int16x8_t vI = vld1q_s16(Iptr + x);
int16x8x2_t vDerivI = vld2q_s16(dIptr);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);
int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));
float32x4_t vb1f = vcvtq_f32_s32(vb1i);
float32x4_t vb2f = vcvtq_f32_s32(vb2i);
vb1 = vaddq_f32(vb1, vb1f);
vb2 = vaddq_f32(vb2, vb2f);
}
for( ; x < wwcn; x++, dIptr += 2 )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (f32)(diff*dIptr[0]);
b2 += (f32)(diff*dIptr[1]);
}
}
f32 bbuf[2];
float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
vst1_f32(bbuf, vb);
b1 += bbuf[0];
b2 += bbuf[1];
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
f32 deltaY = (f32)((A12*b1 - A11*b2) * D);
nextPtX += deltaX;
nextPtY += deltaY;
nextPts[ptref+0] = nextPtX + halfWinX;
nextPts[ptref+1] = nextPtY + halfWinY;
if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
break;
if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
std::abs(deltaY + prevDeltaY) < 0.01 )
{
nextPts[ptref+0] -= deltaX*0.5f;
nextPts[ptref+1] -= deltaY*0.5f;
break;
}
prevDeltaX = deltaX;
prevDeltaY = deltaY;
}
if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
{
f32 nextPointX = nextPts[ptref+0] - halfWinX;
f32 nextPointY = nextPts[ptref+1] - halfWinY;
s32 inextPointX = floor(nextPointX);
s32 inextPointY = floor(nextPointY);
if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
{
if( status )
status[ptidx] = false;
continue;
}
f32 aa = nextPointX - inextPointX;
f32 bb = nextPointY - inextPointY;
iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
iw10 = round((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 errval = 0.f;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
for( x = 0; x < wwcn; x++ )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((f32)diff);
}
}
err[ptidx] = errval / (32*wwcn*winSize.height);
}
}
#else
(void)size;
(void)cn;
(void)prevData;
(void)prevStride;
(void)prevDerivData;
(void)prevDerivStride;
(void)nextData;
(void)nextStride;
(void)prevPts;
(void)nextPts;
(void)status;
(void)err;
(void)winSize;
(void)terminationCount;
(void)terminationEpsilon;
(void)level;
(void)maxLevel;
(void)useInitialFlow;
(void)getMinEigenVals;
(void)minEigThreshold;
(void)ptCount;
#endif
}
}//CAROTENE_NS

View File

@ -0,0 +1,273 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cfloat>
#include <cmath>
#include "common.hpp"
#include "vround_helper.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
#define FASTATAN2CONST(scale) \
f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \
P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \
P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \
P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
A_90((f32)(90.f * scale)), \
A_180((f32)(180.f * scale)), \
A_360((f32)(360.f * scale)); \
float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
_90(vdupq_n_f32(A_90)), \
_180(vdupq_n_f32(A_180)), \
_360(vdupq_n_f32(A_360)), \
z(vdupq_n_f32(0.0f)), \
p1(vdupq_n_f32(P1)), \
p3(vdupq_n_f32(P3)), \
p5(vdupq_n_f32(P5)), \
p7(vdupq_n_f32(P7));
#define FASTATAN2SCALAR(y, x, a) \
{ \
f32 ax = std::abs(x), ay = std::abs(y); \
f32 c, c2; \
if (ax >= ay) \
{ \
c = ay / (ax + (float)DBL_EPSILON); \
c2 = c * c; \
a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
else \
{ \
c = ax / (ay + (float)DBL_EPSILON); \
c2 = c * c; \
a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
if (x < 0) \
a = A_180 - a; \
if (y < 0) \
a = A_360 - a; \
}
#define FASTATAN2VECTOR(v_y, v_x, a) \
{ \
float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
float32x4_t c2 = vmulq_f32(c, c); \
a = vmulq_f32(c2, p7); \
\
a = vmulq_f32(vaddq_f32(a, p5), c2); \
a = vmulq_f32(vaddq_f32(a, p3), c2); \
a = vmulq_f32(vaddq_f32(a, p1), c); \
\
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
\
}
} // namespace
#endif
void phase(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
FASTATAN2CONST(256.0f / 360.0f)
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
// 0
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4_t v_dst32f0;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
// 1
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
vmovn_u16(v_dst16s1)));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vld1q_s16(src1 + j);
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
float32x4_t v_dst32f0;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
vst1_u8(dst + j, vmovn_u16(v_dst));
}
for (; j < size.width; j++)
{
f32 x = src0[j], y = src1[j];
f32 a;
FASTATAN2SCALAR(y, x, a)
dst[j] = (u8)(s32)floor(a + 0.5f);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void phase(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
FASTATAN2CONST(scale)
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw8; j += 8)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);
float32x4_t v_dst32f;
// 0
FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
vst1q_f32(dst + j, v_dst32f);
// 1
FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
vst1q_f32(dst + j + 4, v_dst32f);
}
if(j + 4 <= size.width)
{
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
float32x4_t v_dst32f;
FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
vst1q_f32(dst + j, v_dst32f);
j += 4;
}
for (; j < size.width; j++)
{
f32 a;
FASTATAN2SCALAR(src1[j], src0[j], a)
dst[j] = a;
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,460 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
void reduceColSum(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memset(dstBase, 0, size.width*sizeof(s32));
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
int32x4_t sll = vmovq_n_s32(0);
int32x4_t slh = vmovq_n_s32(0);
int32x4_t shl = vmovq_n_s32(0);
int32x4_t shh = vmovq_n_s32(0);
for (size_t h = 0; h < size.height; h += 256)
{
size_t lim = std::min(h + 256, size.height);
uint16x8_t sl = vmovq_n_u16(0);
uint16x8_t sh = vmovq_n_u16(0);
for (size_t k = h; k < lim; ++k, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v = vld1q_u8(src_address);
sl = vaddw_u8(sl, vget_low_u8(v));
sh = vaddw_u8(sh, vget_high_u8(v));
}
int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));
sll = vqaddq_s32(sll, vsll);
slh = vqaddq_s32(slh, vslh);
shl = vqaddq_s32(shl, vshl);
shh = vqaddq_s32(shh, vshh);
}
vst1q_s32(dstBase + i + 0, sll);
vst1q_s32(dstBase + i + 4, slh);
vst1q_s32(dstBase + i + 8, shl);
vst1q_s32(dstBase + i + 12, shh);
}
for(size_t h = 0; h < size.height; ++h)
{
for(size_t j = i ; j < size.width; j++ )
{
if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu)
dstBase[j] = 0x7fFFffFF;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMax(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width);
size_t i = 0;
for (; i + 16*4 <= size.width; i += 16*4)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address + 0);
uint8x16_t s2 = vld1q_u8(src_address + 16);
uint8x16_t s3 = vld1q_u8(src_address + 32);
uint8x16_t s4 = vld1q_u8(src_address + 48);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
internal::prefetch(src_address + srcStride, 32);
uint8x16_t v1 = vld1q_u8(src_address + 0);
uint8x16_t v2 = vld1q_u8(src_address + 16);
uint8x16_t v3 = vld1q_u8(src_address + 32);
uint8x16_t v4 = vld1q_u8(src_address + 48);
s1 = vmaxq_u8(s1, v1);
s2 = vmaxq_u8(s2, v2);
s3 = vmaxq_u8(s3, v3);
s4 = vmaxq_u8(s4, v4);
}
vst1q_u8(dstBase + i + 0, s1);
vst1q_u8(dstBase + i + 16, s2);
vst1q_u8(dstBase + i + 32, s3);
vst1q_u8(dstBase + i + 48, s4);
}
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v1 = vld1q_u8(src_address);
s1 = vmaxq_u8(s1, v1);
}
vst1q_u8(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMin(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width);
size_t i = 0;
for (; i + 16*4 <= size.width; i += 16*4)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address + 0);
uint8x16_t s2 = vld1q_u8(src_address + 16);
uint8x16_t s3 = vld1q_u8(src_address + 32);
uint8x16_t s4 = vld1q_u8(src_address + 48);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
internal::prefetch(src_address + srcStride, 32);
uint8x16_t v1 = vld1q_u8(src_address + 0);
uint8x16_t v2 = vld1q_u8(src_address + 16);
uint8x16_t v3 = vld1q_u8(src_address + 32);
uint8x16_t v4 = vld1q_u8(src_address + 48);
s1 = vminq_u8(s1, v1);
s2 = vminq_u8(s2, v2);
s3 = vminq_u8(s3, v3);
s4 = vminq_u8(s4, v4);
}
vst1q_u8(dstBase + i + 0, s1);
vst1q_u8(dstBase + i + 16, s2);
vst1q_u8(dstBase + i + 32, s3);
vst1q_u8(dstBase + i + 48, s4);
}
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v1 = vld1q_u8(src_address);
s1 = vminq_u8(s1, v1);
}
vst1q_u8(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColSum(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vaddq_f32(s1, v1);
s2 = vaddq_f32(s2, v2);
s3 = vaddq_f32(s3, v3);
s4 = vaddq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vaddq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
{
for(size_t j = i ; j < size.width; j++ )
{
dstBase[j] += srcBase[j + srcstep * h];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMax(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vmaxq_f32(s1, v1);
s2 = vmaxq_f32(s2, v2);
s3 = vmaxq_f32(s3, v3);
s4 = vmaxq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vmaxq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMin(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vminq_f32(s1, v1);
s2 = vminq_f32(s2, v2);
s3 = vminq_f32(s3, v3);
s4 = vminq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vminq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace internal {
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
dst_row[x] = srcBase[map_row[x]];
}
}
}
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
s32 src_idx = map_row[x];
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
}
}
}
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s32 src00_index = map_row[(x << 2)];
s32 src10_index = map_row[(x << 2) + 2];
f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
srcBase[src00_index];
f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
srcBase[src10_index];
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
} // namespace internal
#endif // CAROTENE_NEON
bool isRemapNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isRemapLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
int32x2_t v_step2 = vdup_n_s32(srcStride);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
int32x2_t v_zero2 = vdup_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
}
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
vst1_s32(map_row + x, v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
int32x2_t v_m1_2 = vdup_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
float32x2_t v_zero2 = vdup_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
}
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
vst1_s32(map_row + x, v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void remapLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[x << 1] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1)] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,85 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_REMAP_HPP
#define CAROTENE_SRC_REMAP_HPP
#include "common.hpp"
#include <cmath>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
enum
{
BLOCK_SIZE = 32
};
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride);
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride);
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
} }
#endif // CAROTENE_NEON
#endif // CAROTENE_SRC_REMAP_HPP

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,199 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SATURATE_CAST_HPP
#define CAROTENE_SATURATE_CAST_HPP
#include <algorithm>
#include <climits>
#include <cmath>
#if defined _MSC_VER && defined _M_ARM
# include <intrin.h>
#endif
#include <carotene/definitions.hpp>
#include <carotene/types.hpp>
namespace CAROTENE_NS { namespace internal {
#if defined _MSC_VER && defined _M_ARM
__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
{
(void)d;
__emit(0xEEBD); // vcvtr.s32.f64 s0, d0
__emit(0x0B40);
__emit(0xEE10); // vmov r0, s0
__emit(0x0A10);
__emit(0x4770); // bx lr
}
# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
#elif defined CV_ICC || defined __GNUC__
# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
# define CAROTENE_ROUND_FLT(value) { \
union { f32 f; s32 i; } result; \
asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
return result.i; }
# define CAROTENE_ROUND_DBL(value) { \
union {f32 f; s32 i;} __tegra_result; \
asm ( \
"ftosid %0, %P1\n" \
: "=w" (__tegra_result.f) \
: "w" (value) \
); \
return __tegra_result.i; \
}
# else
# define CAROTENE_ROUND_FLT(x) return (s32)lrintf(value);
# define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
# endif
#endif
inline s32 round(f32 value)
{
#ifdef CAROTENE_ROUND_FLT
CAROTENE_ROUND_FLT(value)
#else
s32 intpart = (s32)(value);
f32 fractpart = value - intpart;
if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
return (s32)(value + (value >= 0 ? 0.5 : -0.5));
else
return intpart;
#endif
}
inline s32 round(f64 value)
{
#ifdef CAROTENE_ROUND_DBL
CAROTENE_ROUND_DBL(value)
#else
s32 intpart = (s32)(value);
f64 fractpart = value - intpart;
if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
return (s32)(value + (value >= 0 ? 0.5 : -0.5));
else
return intpart;
#endif
}
/////////////// saturate_cast (used in image & signal processing) ///////////////////
template<typename _Tp> inline _Tp saturate_cast(u8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
template<> inline u8 saturate_cast<u8>(s8 v) { return (u8)std::max((s32)v, 0); }
template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
template<> inline s8 saturate_cast<s8>(u8 v) { return (s8)std::min((s32)v, SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
template<> inline u16 saturate_cast<u16>(s8 v) { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
template<> inline u32 saturate_cast<u32>(s8 v) { return (u32)std::max(v, (s8)0); }
template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
//OpenCV like f32/f64 -> u32 conversion
//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
//Negative clipping implementation
//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
template<> inline u64 saturate_cast<u64>(s8 v) { return (u64)std::max(v, (s8)0); }
template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
} }
#endif

View File

@ -0,0 +1,219 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
return (dx == 0 && dy == 1 &&
isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
(dx == 1 && dy == 0 &&
isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
}
void Scharr3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
static s16 dw[] = {3, 10, 3};
if (dy == 1)
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
3, 1, dw, 0,
border, borderValue, borderMargin);
else
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
1, 3, 0, dw,
border, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
void ScharrDeriv(const Size2D &size, s32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t colsn = size.width*cn;
size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
std::vector<s16> _tempBuf((delta << 1) + 64);
s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
int16x8_t vc3 = vmovq_n_s16(3);
int16x8_t vc10 = vmovq_n_s16(10);
uint8x8_t v8c10 = vmov_n_u8(10);
for(size_t y = 0; y < size.height; y++ )
{
const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
// do vertical convolution
size_t x = 0;
for( ; x < roiw8; x += 8 )
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
__asm__ (
"vld1.8 {d0}, [%[src0]] \n\t"
"vld1.8 {d2}, [%[src2]] \n\t"
"vld1.8 {d1}, [%[src1]] \n\t"
"vaddl.u8 q2, d2, d0 \n\t"
"vmull.u8 q3, d1, %[vc10] \n\t"
"vsubl.u8 q4, d2, d0 \n\t"
"vmla.s16 q3, q2, %q[vc3] \n\t"
"vst1.16 {d8-d9}, [%[out1],:128] \n\t"
"vst1.16 {d6-d7}, [%[out0],:128] \n\t"
:
: [out0] "r" (trow0 + x),
[out1] "r" (trow1 + x),
[src0] "r" (srow0 + x),
[src1] "r" (srow1 + x),
[src2] "r" (srow2 + x),
[vc10] "w" (v8c10), [vc3] "w" (vc3)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
);
#else
uint8x8_t s0 = vld1_u8(srow0 + x);
uint8x8_t s1 = vld1_u8(srow1 + x);
uint8x8_t s2 = vld1_u8(srow2 + x);
int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
vst1q_s16(trow1 + x, t1);
vst1q_s16(trow0 + x, t0);
#endif
}
for( ; x < colsn; x++ )
{
trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
trow1[x] = (s16)(srow2[x] - srow0[x]);
}
// make border
size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
for( s32 k = 0; k < cn; k++ )
{
trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
}
// do horizontal convolution, interleave the results and store them to dst
x = 0;
for( ; x < roiw8; x += 8 )
{
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
__asm__ (
"vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
"vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
"vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
"vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
"vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
"vadd.i16 q7, q2, q4 \n\t"
"vmul.s16 q6, q3, %q[vc10] \n\t"
"vsub.s16 q5, q1, q0 \n\t"
"vmla.s16 q6, q7, %q[vc3] \n\t"
"vst2.16 {d10-d13}, [%[out]] \n\t"
:
: [out] "r" (drow + x * 2),
[s0ptr] "r" (trow0 + x - cn),
[s1ptr] "r" (trow0 + x + cn),
[s2ptr] "r" (trow1 + x - cn),
[s3ptr] "r" (trow1 + x),
[s4ptr] "r" (trow1 + x + cn),
[vc10] "w" (vc10), [vc3] "w" (vc3)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
);
#else
int16x8_t s0 = vld1q_s16(trow0 + x - cn);
int16x8_t s1 = vld1q_s16(trow0 + x + cn);
int16x8_t s2 = vld1q_s16(trow1 + x - cn);
int16x8_t s3 = vld1q_s16(trow1 + x);
int16x8_t s4 = vld1q_s16(trow1 + x + cn);
int16x8_t s3x10 = vmulq_s16(s3, vc10);
int16x8_t s24 = vaddq_s16(s2, s4);
int16x8x2_t vr;
vr.val[0] = vsubq_s16(s1, s0);
vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
vst2q_s16(drow + x*2, vr);
#endif
}
for( ; x < colsn; x++ )
{
drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
}
}
#else
(void)size;
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,109 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
return isSupportedConfiguration() &&
size.width >= 9 && size.height >= 1 &&
(size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
(dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE );
}
void SeparableFilter3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
#ifdef CAROTENE_NEON
if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
std::abort();//Couldn't call generic filter without provided weights
typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
const s16*, const s16*, BORDER_MODE, u8, Margin);
static sepFilter3x3_8u16s_func quickFilters[4][4]=
{
/*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_121>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_121>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_121>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
/*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_m101>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_m101>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_m101>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
/*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_1m21>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_1m21>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_1m21>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
/*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16Generic>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16Generic>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16Generic>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
};
quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
xw, yw, border, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)xw;
(void)yw;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,317 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
s32 dx, s32 dy, Margin borderMargin)
{
return dx < 3 && dx >= 0 &&
dy < 3 && dy >= 0 &&
(dx + dy) > 0 &&
isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
}
void Sobel3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
dx, dy, 0, 0,
borderType, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
s32 dx, s32 dy)
{
return isSupportedConfiguration() &&
dx < 3 && dx >= 0 &&
dy < 3 && dy >= 0 &&
(dx + dy) > 0 &&
size.width >= 4 && size.height >= 2 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE );
}
void Sobel3x3(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE borderType, f32 borderValue)
{
internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
#ifdef CAROTENE_NEON
std::vector<f32> _tmp;
f32 *tmp = 0;
if (borderType == BORDER_MODE_CONSTANT)
{
_tmp.assign(size.width + 2, borderValue);
tmp = &_tmp[1];
}
ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
std::vector<f32> _tempBuf((delta << 1) + 64);
f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);
for( size_t y = 0; y < size.height; y++ )
{
const f32* srow0;
const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
const f32* srow2;
f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
if (borderType == BORDER_MODE_REFLECT101) {
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
} else if (borderType == BORDER_MODE_CONSTANT) {
srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
}
float32x4_t tprev = vmovq_n_f32(0.f);
float32x4_t tcurr = vmovq_n_f32(0.f);
float32x4_t tnext = vmovq_n_f32(0.f);
float32x4_t t0, t1, t2;
// do vertical convolution
size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
for( ; x <= bcolsn; x += 4 )
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
float32x4_t x0 = vld1q_f32(srow0 + x);
float32x4_t x1 = vld1q_f32(srow1 + x);
float32x4_t x2 = vld1q_f32(srow2 + x);
tprev = tcurr;
tcurr = tnext;
if(!dy)
{
tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);
}
else if(dy == 2)
{
tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));
}
else
{
tnext = vsubq_f32(x2, x0);
}
if(!x) {
tcurr = tnext;
// make border
if (borderType == BORDER_MODE_CONSTANT)
{
tcurr = vsetq_lane_f32(borderValue,tcurr, 3);
}
else if (borderType == BORDER_MODE_REFLECT101)
{
tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
}
else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
{
tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
}
continue;
}
internal::prefetch(trow0 + x);
internal::prefetch(trow1 + x);
t0 = vextq_f32(tprev, tcurr, 3);
t1 = tcurr;
t2 = vextq_f32(tcurr, tnext, 1);
if(!dx)
{
t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));
}
else if(dx == 2)
{
t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));
}
else
{
t0 = vsubq_f32(t2, t0);
}
if(!(y%2))
{
vst1q_f32(trow0 + x - 4, t0);
}
else
{
vst1q_f32(trow1 + x - 4, t0);
}
}
x -= 4;
if(x == size.width){
x--;
}
f32 prevx = 0, rowx = 0, nextx = 0;
if(!dy)
{
prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 4*borderValue :
srow2[0] + 2*srow1[0] + srow0[0]) );
rowx = srow2[x] + 2*srow1[x] + srow0[x];
}
else if(dy == 2)
{
prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 0.f :
srow2[0] - 2*srow1[0] + srow0[0]) );
rowx = srow2[x] - 2*srow1[x] + srow0[x];
}
else
{
prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 0.f :
srow2[0] - srow0[0]) );
rowx = srow2[x] - srow0[x];
}
for( ; x < size.width; x++ )
{
if(x+1 == size.width) {
// make border
if (borderType == BORDER_MODE_CONSTANT)
{
if(!dy) {
nextx = 4*borderValue;
} else {
nextx = 0.f;
}
} else if (borderType == BORDER_MODE_REFLECT101)
{
if(!dy) {
nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
} else if(dy == 2) {
nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
} else {
nextx = srow2[x-1] - srow0[x-1];
}
} else {
if(!dy) {
nextx = srow2[x] + 2*srow1[x] + srow0[x];
} else if(dy == 2) {
nextx = srow2[x] - 2*srow1[x] + srow0[x];
} else {
nextx = srow2[x] - srow0[x];
}
}
} else {
if(!dy) {
nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
} else if(dy == 2) {
nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
} else {
nextx = srow2[x+1] - srow0[x+1];
}
}
f32 res;
if(dx==1) {
res = nextx - prevx;
} else if(!dx) {
res = prevx + 2*rowx + nextx;
} else {
res = prevx - 2*rowx + nextx;
}
if(!(y%2)) {
*(trow0+x) = res;
} else {
*(trow1+x) = res;
}
prevx = rowx;
rowx = nextx;
}
if(y>0) {
for(size_t x1 = 0; x1 < size.width; x1++ )
{
if(y%2)
*(drow + x1) = trow0[x1];
else
*(drow + x1) = trow1[x1];
}
}
if(y == size.height-1) {
for(size_t x1 = 0; x1 < size.width; x1++ )
{
if(!(y%2))
*(drow1 + x1) = trow0[x1];
else
*(drow1 + x1) = trow1[x1];
}
}
}
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,621 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct SubWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
}
};
template <typename T, typename WT>
struct SubSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
}
};
} // namespace
#endif
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1));
}
for (; j < size.width; j++)
dst[j] = (s16)src0[j] - (s16)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
f32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10)));
int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));
vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11)));
vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1));
vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) )));
vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) )));
}
for(; j < size.width; j++)
dst[j] = (f32)src0[j] - (f32)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
uint8x16_t v_src1 = vld1q_u8(src1 + j);
int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
uint8x16_t v_src1 = vld1q_u8(src1 + j);
int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u32 * src0Base, ptrdiff_t src0Stride,
const u32 * src1Base, ptrdiff_t src1Stride,
u32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<f32, f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,385 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
bool isSumSupported(u32 channels)
{
return (channels && channels < 5);
}
void sum(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride,
u32 * sumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const ptrdiff_t width = size.width * channels;
for(size_t k = 0; k < size.height; ++k)
{
const u8* src = internal::getRowPtr( srcBase, srcStride, k);
ptrdiff_t i = 0;
if (channels == 3)
{
uint32x4_t vs1231 = vdupq_n_u32(0);
uint32x4_t vs3123 = vdupq_n_u32(0);
uint32x4_t vs2312 = vdupq_n_u32(0);
for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
{
internal::prefetch(src + j + 24);
s1 = vaddw_u8(s1, vld1_u8(src + j + 0));
s2 = vaddw_u8(s2, vld1_u8(src + j + 8));
s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
}
vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
}
if (i <= width - 8*3)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
{
internal::prefetch(src + 24);
s1 = vaddw_u8(s1, vld1_u8(src + 0));
s2 = vaddw_u8(s2, vld1_u8(src + 8));
s3 = vaddw_u8(s3, vld1_u8(src + 16));
}
vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
}
u32 sum[12];
vst1q_u32(sum+0, vs1231);
vst1q_u32(sum+4, vs2312);
vst1q_u32(sum+8, vs3123);
for (; i < width; i += 3, src += 3)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
sumdst[2] += src[2];
}
sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
}
else
{
uint32x4_t vs = vdupq_n_u32(0);
for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src));
for (int j = 8; j < 257 * 8; j += 8)
{
internal::prefetch(src + j);
s1 = vaddw_u8(s1, vld1_u8(src + j));
}
vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
}
if (i < width - 7)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src));
for(i+=8,src+=8; i < width-7; i+=8,src+=8)
{
internal::prefetch(src);
s1 = vaddw_u8(s1, vld1_u8(src));
}
vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
}
if (channels == 1)
{
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));
u32 s0 = vget_lane_u32(vs1, 0);
for(; i < width; ++i,++src)
s0 += src[0];
sumdst[0] += s0;
}
else if (channels == 4)
{
vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));
for(; i < width; i+=4,src+=4)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
sumdst[2] += src[2];
sumdst[3] += src[3];
}
}
else//if (channels == 2)
{
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));
for(; i < width; i+=2,src+=2)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
}
}
}//channels != 3
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)channels;
#endif
}
void sum(const Size2D &_size,
const f32 * srcBase, ptrdiff_t srcStride,
f64 * sumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const ptrdiff_t width = size.width * channels;
for(size_t k = 0; k < size.height; ++k)
{
const f32* src = internal::getRowPtr( srcBase, srcStride, k);
ptrdiff_t i = 0;
if (channels == 3)
{
float32x4_t vs1231 = vdupq_n_f32(0);
float32x4_t vs2312 = vdupq_n_f32(0);
float32x4_t vs3123 = vdupq_n_f32(0);
for(; i <= width-12; i += 12)
{
internal::prefetch(src + i + 12);
vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
}
f32 s[12];
vst1q_f32(s + 0, vs1231);
vst1q_f32(s + 4, vs2312);
vst1q_f32(s + 8, vs3123);
sumdst[0] += s[0] + s[3] + s[6] + s[9];
sumdst[1] += s[1] + s[4] + s[7] + s[10];
sumdst[2] += s[2] + s[5] + s[8] + s[11];
for( ; i < width; i+=3)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
sumdst[2] += src[i+2];
}
}
else
{
float32x4_t vs = vdupq_n_f32(0);
for(; i <= width-4; i += 4)
{
internal::prefetch(src + i);
vs = vaddq_f32(vs, vld1q_f32(src+i));
}
if (channels == 1)
{
float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0] + s[1];
for( ; i < width; i++)
sumdst[0] += src[i];
}
else if (channels == 4)
{
f32 s[4];
vst1q_f32(s, vs);
sumdst[0] += s[0];
sumdst[1] += s[1];
sumdst[2] += s[2];
sumdst[3] += s[3];
}
else//if (channels == 2)
{
float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0];
sumdst[1] += s[1];
if(i < width)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
}
}
}//channels != 3
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)channels;
#endif
}
bool isSqsumSupported(u32 channels)
{
return (channels && ((4/channels)*channels == 4));
}
void sqsum(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride,
f64 * sumdst, f64 * sqsumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSqsumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width*channels))
{
size.width *= size.height;
size.height = 1;
}
const size_t width = size.width * channels;
size_t blockSize0 = 1 << 23;
size_t roiw8 = width & ~7;
uint32x4_t v_zero = vdupq_n_u32(0u);
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw8)
{
size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
uint32x4_t v_sum = v_zero;
uint32x4_t v_sqsum = v_zero;
for ( ; j < blockSize ; j += 8, src += 8)
{
internal::prefetch(src);
uint8x8_t v_src0 = vld1_u8(src);
uint16x8_t v_src = vmovl_u8(v_src0);
uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
}
u32 arsum[8];
vst1q_u32(arsum, v_sum);
vst1q_u32(arsum + 4, v_sqsum);
sumdst[0] += (f64)arsum[0];
sumdst[1 % channels] += (f64)arsum[1];
sumdst[2 % channels] += (f64)arsum[2];
sumdst[3 % channels] += (f64)arsum[3];
sqsumdst[0] += (f64)arsum[4];
sqsumdst[1 % channels] += (f64)arsum[5];
sqsumdst[2 % channels] += (f64)arsum[6];
sqsumdst[3 % channels] += (f64)arsum[7];
}
// collect a few last elements in the current row
// it's ok to process channels elements per step
// since we could handle 1,2 or 4 channels
// we always have channels-fold amount of elements remaining
for ( ; j < width; j+=channels, src+=channels)
{
for (u32 kk = 0; kk < channels; kk++)
{
u32 srcval = src[kk];
sumdst[kk] += srcval;
sqsumdst[kk] += srcval * srcval;
}
}
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)sqsumdst;
(void)channels;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than
//time for simultaneous 2 lines matching for the same amount of data
bool isMatchTemplateSupported(const Size2D &tmplSize)
{
return isSupportedConfiguration() &&
tmplSize.width >= 8 && // Actually the function could process even shorter templates
// but there will be no NEON optimization in this case
(tmplSize.width * tmplSize.height) <= 256;
}
void matchTemplate(const Size2D &srcSize,
const u8 * srcBase, ptrdiff_t srcStride,
const Size2D &tmplSize,
const u8 * tmplBase, ptrdiff_t tmplStride,
f32 * dstBase, ptrdiff_t dstStride,
bool normalize)
{
internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
#ifdef CAROTENE_NEON
const size_t tmplW = tmplSize.width;
const size_t tmplH = tmplSize.height;
const size_t dstW = srcSize.width - tmplSize.width + 1;
const size_t dstH = srcSize.height - tmplSize.height + 1;
//template correlation part
{
#if ENABLE4LINESMATCHING
const size_t dstroiw4 = dstW & ~3u;
#endif
const size_t dstroiw2 = dstW & ~1u;
const size_t tmplroiw = tmplW & ~7u;
const size_t dstride = dstStride >> 2;
f32 *corr = dstBase;
const u8 *imgrrow = srcBase;
for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
{
size_t c = 0;
#if ENABLE4LINESMATCHING
for(; c < dstroiw4; c+=4)
{
u32 dot[4] = {0, 0, 0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
uint32x4_t vdot2 = vmovq_n_u32(0);
uint32x4_t vdot3 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
vdot2 = vpadalq_u16(vdot2, vd2);
vdot3 = vpadalq_u16(vdot3, vd3);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
dot[2] += tmpl[j] * img[j + c + 2];
dot[3] += tmpl[j] * img[j + c + 3];
}
}
uint32x4_t vdotx = vld1q_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
}
#endif
for(; c < dstroiw2; c+=2)
{
u32 dot[2] = {0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
}
}
uint32x2_t vdotx = vld1_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
}
for(; c < dstW; ++c)
{
u32 dot = 0;
uint32x4_t vdot = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg = vld1_u8(img + j + c);
uint16x8_t vd = vmull_u8(vtmpl, vimg);
vdot = vpadalq_u16(vdot, vd);
}
for(; j < tmplW; ++j)
dot += tmpl[j] * img[j + c];
}
u32 wdot[2];
vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
dot += wdot[0] + wdot[1];
corr[c] = (f32)dot;
}
}
}
if(normalize)
{
f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
size_t iw = srcSize.width+1;
size_t ih = srcSize.height+1;
std::vector<f64> _sqsum(iw*ih);
f64 *sqsum = &_sqsum[0];
memset(sqsum, 0, iw*sizeof(f64));
for(size_t i = 1; i < ih; ++i)
sqsum[iw*i] = 0.;
sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
for(size_t i = 0; i < dstH; ++i)
{
f32 *result = internal::getRowPtr(dstBase, dstStride, i);
for(size_t j = 0; j < dstW; ++j)
{
double s2 = sqsum[iw*i + j] +
sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
sqsum[iw*(i + tmplSize.height) + j] -
sqsum[iw*i + j + tmplSize.width];
result[j] /= tn * std::sqrt(s2);
}
}
}
#else
(void)srcSize;
(void)srcBase;
(void)srcStride;
(void)tmplBase;
(void)tmplStride;
(void)dstBase;
(void)dstStride;
(void)normalize;
#endif
}
} // namespace CAROTENE_NS

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,102 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
#define CAROTENE_SRC_VROUND_HELPER_HPP
#include "common.hpp"
#include "vtransform.hpp"
#ifdef CAROTENE_NEON
/**
* This helper header is for rounding from float32xN to uin32xN or int32xN to nearest, ties to even.
* See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
*/
// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
#define CAROTENE_ROUND_DELTA (12582912.0f)
namespace CAROTENE_NS { namespace internal {
inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtnq_u32_f32(val);
#else
const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
return vcvtq_u32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}
inline uint32x2_t vround_u32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtn_u32_f32(val);
#else
const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
return vcvt_u32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}
inline int32x4_t vroundq_s32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtnq_s32_f32(val);
#else
const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
return vcvtq_s32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}
inline int32x2_t vround_s32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
return vcvtn_s32_f32(val);
#else
const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
return vcvt_s32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}
} }
#endif // CAROTENE_NEON
#endif

View File

@ -0,0 +1,689 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VTRANSFORM_HPP
#define CAROTENE_SRC_VTRANSFORM_HPP
#include "common.hpp"
#include <carotene/types.hpp>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
////////////////////////////// Type Traits ///////////////////////
template <typename T, int cn = 1>
struct VecTraits;
template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
////////////////////////////// vld1q ///////////////////////
inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
////////////////////////////// vld1 ///////////////////////
inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
////////////////////////////// vld2q ///////////////////////
inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
////////////////////////////// vld2 ///////////////////////
inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
////////////////////////////// vld3q ///////////////////////
inline uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); }
inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); }
inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); }
inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); }
inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); }
inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); }
inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); }
////////////////////////////// vld3 ///////////////////////
inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); }
inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); }
inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); }
inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); }
inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); }
inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); }
inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); }
////////////////////////////// vld4q ///////////////////////
inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); }
inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); }
inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); }
inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); }
inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); }
inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); }
inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); }
////////////////////////////// vld4 ///////////////////////
inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); }
inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); }
inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); }
inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); }
inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); }
inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); }
inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); }
////////////////////////////// vst1q ///////////////////////
inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); }
inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); }
inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); }
inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); }
inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); }
inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); }
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
////////////////////////////// vst1 ///////////////////////
inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); }
inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); }
inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); }
inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); }
inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); }
inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); }
inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); }
////////////////////////////// vst2q ///////////////////////
inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); }
inline void vst2q(s8 * ptr, const int8x16x2_t & v) { return vst2q_s8(ptr, v); }
inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); }
inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); }
inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); }
inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); }
inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); }
////////////////////////////// vst2 ///////////////////////
inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); }
inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); }
inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); }
inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); }
inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); }
inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); }
inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); }
////////////////////////////// vst3q ///////////////////////
inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); }
inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); }
inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); }
inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); }
inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); }
inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); }
inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); }
////////////////////////////// vst3 ///////////////////////
inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); }
inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); }
inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); }
inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); }
inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); }
inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); }
inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); }
////////////////////////////// vst4q ///////////////////////
inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); }
inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); }
inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); }
inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); }
inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); }
inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); }
inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }
////////////////////////////// vst4 ///////////////////////
inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); }
inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); }
inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); }
inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); }
inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); }
inline void vst4(s32 * ptr, const int32x2x4_t & v) { return vst4_s32(ptr, v); }
inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); }
////////////////////////////// vabdq ///////////////////////
inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); }
inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); }
inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); }
inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); }
inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); }
inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); }
inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); }
////////////////////////////// vabd ///////////////////////
inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); }
inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); }
inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); }
inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); }
inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); }
inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); }
inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); }
////////////////////////////// vminq ///////////////////////
inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); }
inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); }
inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); }
inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); }
inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); }
inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); }
inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }
////////////////////////////// vmin ///////////////////////
inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); }
inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); }
inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); }
inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); }
inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); }
inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); }
inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); }
////////////////////////////// vmaxq ///////////////////////
inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); }
inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); }
inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); }
inline int16x8_t vmaxq(const int16x8_t & v0, const int16x8_t & v1) { return vmaxq_s16(v0, v1); }
inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); }
inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); }
inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); }
////////////////////////////// vmax ///////////////////////
inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); }
inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); }
inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); }
inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); }
inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); }
inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); }
inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); }
////////////////////////////// vdupq_n ///////////////////////
inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); }
inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); }
inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); }
inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); }
inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); }
inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); }
inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); }
inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); }
inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); }
////////////////////////////// vdup_n ///////////////////////
inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); }
inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); }
inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); }
inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); }
inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); }
inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); }
inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); }
inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); }
inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); }
////////////////////////////// vget_low ///////////////////////
inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); }
inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); }
inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); }
inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); }
inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); }
inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); }
inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); }
////////////////////////////// vget_high ///////////////////////
inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); }
inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); }
inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); }
inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); }
inline uint32x2_t vget_high(const uint32x4_t & v) { return vget_high_u32(v); }
inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); }
inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); }
////////////////////////////// vcombine ///////////////////////
inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); }
inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); }
inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); }
inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); }
inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); }
inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); }
inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); }
////////////////////////////// vaddq ///////////////////////
inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); }
inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); }
inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); }
inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); }
inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); }
inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); }
inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); }
////////////////////////////// vadd ///////////////////////
inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); }
inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); }
inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); }
inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); }
inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); }
inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); }
inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); }
////////////////////////////// vqaddq ///////////////////////
inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); }
inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); }
inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); }
inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); }
inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); }
inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); }
////////////////////////////// vqadd ///////////////////////
inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); }
inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); }
inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); }
inline int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); }
inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); }
inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); }
////////////////////////////// vsubq ///////////////////////
inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); }
inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); }
inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); }
inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); }
inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); }
inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); }
inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); }
////////////////////////////// vsub ///////////////////////
inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); }
inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); }
inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); }
inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); }
inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); }
inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); }
inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); }
////////////////////////////// vqsubq ///////////////////////
inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); }
inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); }
inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); }
inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); }
inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); }
inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); }
inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); }
inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); }
////////////////////////////// vqsub ///////////////////////
inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); }
inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); }
inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); }
inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); }
inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); }
inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); }
inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); }
inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); }
////////////////////////////// vmull ///////////////////////
inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); }
inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); }
inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); }
inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); }
inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); }
inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); }
////////////////////////////// vrev64q ///////////////////////
inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); }
inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); }
inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); }
inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); }
inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); }
inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); }
inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); }
////////////////////////////// vrev64 ///////////////////////
inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); }
inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); }
inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); }
inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); }
inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); }
inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); }
inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); }
////////////////////////////// vceqq ///////////////////////
inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); }
inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); }
inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); }
inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); }
inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); }
inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); }
inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }
////////////////////////////// vceq ///////////////////////
inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); }
inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); }
inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); }
inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); }
inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); }
inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); }
inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); }
////////////////////////////// vcgtq ///////////////////////
inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); }
inline uint8x16_t vcgtq(const int8x16_t & v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); }
inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); }
inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); }
inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); }
inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); }
inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); }
////////////////////////////// vcgt ///////////////////////
inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); }
inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); }
inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); }
inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); }
inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); }
inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); }
inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); }
////////////////////////////// vcgeq ///////////////////////
inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); }
inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); }
inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); }
inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); }
inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); }
inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); }
inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); }
////////////////////////////// vcge ///////////////////////
inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); }
inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); }
inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); }
inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); }
inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); }
inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); }
inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); }
////////////////////////////// vandq ///////////////////////
inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); }
inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); }
inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); }
inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); }
inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); }
inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); }
////////////////////////////// vand ///////////////////////
inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); }
inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); }
inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); }
inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); }
inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); }
inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); }
////////////////////////////// vmovn ///////////////////////
inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); }
inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); }
inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); }
inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); }
inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); }
inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); }
////////////////////////////// vqmovn ///////////////////////
inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); }
inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); }
inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); }
inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); }
inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); }
inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); }
////////////////////////////// vmovl ///////////////////////
inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); }
inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); }
inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); }
inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); }
////////////////////////////// vmvnq ///////////////////////
inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); }
inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); }
inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); }
inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); }
inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); }
inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); }
////////////////////////////// vmvn ///////////////////////
inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); }
inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); }
inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); }
inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); }
inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); }
inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); }
////////////////////////////// vbicq ///////////////////////
inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); }
inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); }
inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); }
inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); }
inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); }
inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); }
inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
inline int64x2_t vbicq(const int64x2_t & v0, const int64x2_t & v1) { return vbicq_s64(v0, v1); }
////////////////////////////// vbic ///////////////////////
inline uint8x8_t vbic(const uint8x8_t & v0, const uint8x8_t & v1) { return vbic_u8 (v0, v1); }
inline int8x8_t vbic(const int8x8_t & v0, const int8x8_t & v1) { return vbic_s8 (v0, v1); }
inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
inline int16x4_t vbic(const int16x4_t & v0, const int16x4_t & v1) { return vbic_s16(v0, v1); }
inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
inline int32x2_t vbic(const int32x2_t & v0, const int32x2_t & v1) { return vbic_s32(v0, v1); }
inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
inline int64x1_t vbic(const int64x1_t & v0, const int64x1_t & v1) { return vbic_s64(v0, v1); }
////////////////////////////// vtransform ///////////////////////
template <typename Op>
void vtransform(Size2D size,
const typename Op::type * src0Base, ptrdiff_t src0Stride,
const typename Op::type * src1Base, ptrdiff_t src1Stride,
typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
typedef typename Op::type type;
typedef typename VecTraits<type>::vec128 vec128;
typedef typename VecTraits<type>::vec64 vec64;
if (src0Stride == src1Stride && src0Stride == dstStride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
{
size.width *= size.height;
size.height = 1;
}
const size_t step_base = 32 / sizeof(type);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
const size_t step_tail = 8 / sizeof(type);
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t y = 0; y < size.height; ++y)
{
const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for( ; x < roiw_base; x += step_base )
{
internal::prefetch(src0 + x);
internal::prefetch(src1 + x);
vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type));
vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type));
vec128 v_dst;
op(v_src00, v_src10, v_dst);
vst1q(dst + x, v_dst);
op(v_src01, v_src11, v_dst);
vst1q(dst + x + 16 / sizeof(type), v_dst);
}
for( ; x < roiw_tail; x += step_tail )
{
vec64 v_src0 = vld1(src0 + x);
vec64 v_src1 = vld1(src1 + x);
vec64 v_dst;
op(v_src0, v_src1, v_dst);
vst1(dst + x, v_dst);
}
for (; x < size.width; ++x)
{
op(src0 + x, src1 + x, dst + x);
}
}
}
} }
#endif // CAROTENE_NEON
#endif

View File

@ -0,0 +1,434 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpAffineLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,464 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,35 @@
if(NOT ANDROID)
message("cpufeatures is ANDROID project")
endif()
ocv_update(OPENCV_CPUFEATURES_TARGET_NAME libcpufeatures)
set(CPUFEATURES_ROOT "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Android cpufeatures project sources (for example, <android-ndk>/sources/android/cpufeatures)")
set(CPUFEATURES_INCLUDE_DIRS ${CPUFEATURES_ROOT} CACHE INTERNAL "")
set(CPUFEATURES_LIBRARIES "${OPENCV_CPUFEATURES_TARGET_NAME}" CACHE INTERNAL "")
if(NOT DEFINED CPUFEATURES_SOURCES)
set(CPUFEATURES_SOURCES ${CPUFEATURES_ROOT}/cpu-features.c ${CPUFEATURES_ROOT}/cpu-features.h)
endif()
include_directories(${CPUFEATURES_INCLUDE_DIRS})
add_library(${OPENCV_CPUFEATURES_TARGET_NAME} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${CPUFEATURES_SOURCES})
set_target_properties(${OPENCV_CPUFEATURES_TARGET_NAME}
PROPERTIES OUTPUT_NAME cpufeatures
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
COMPILE_PDB_NAME cpufeatures
COMPILE_PDB_NAME_DEBUG "cpufeatures${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${OPENCV_CPUFEATURES_TARGET_NAME} PROPERTIES FOLDER "3rdparty")
endif()
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(${OPENCV_CPUFEATURES_TARGET_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev OPTIONAL)
endif()
ocv_install_3rdparty_licenses(cpufeatures LICENSE README.md)

View File

@ -0,0 +1,13 @@
Copyright (C) 2016 The Android Open Source Project
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,4 @@
The Android NDK provides a small library named cpufeatures that your app can use at runtime to detect the target device's CPU family and the optional features it supports.
It is designed to work as-is on all official Android platform versions.
https://developer.android.com/ndk/guides/cpu-features.html

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,324 @@
/*
* Copyright (C) 2010 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef CPU_FEATURES_H
#define CPU_FEATURES_H
#include <sys/cdefs.h>
#include <stdint.h>
#include <string.h>
__BEGIN_DECLS
/* A list of valid values returned by android_getCpuFamily().
* They describe the CPU Architecture of the current process.
*/
typedef enum {
ANDROID_CPU_FAMILY_UNKNOWN = 0,
ANDROID_CPU_FAMILY_ARM,
ANDROID_CPU_FAMILY_X86,
ANDROID_CPU_FAMILY_MIPS,
ANDROID_CPU_FAMILY_ARM64,
ANDROID_CPU_FAMILY_X86_64,
ANDROID_CPU_FAMILY_MIPS64,
ANDROID_CPU_FAMILY_MAX /* do not remove */
} AndroidCpuFamily;
/* Return the CPU family of the current process.
*
* Note that this matches the bitness of the current process. I.e. when
* running a 32-bit binary on a 64-bit capable CPU, this will return the
* 32-bit CPU family value.
*/
extern AndroidCpuFamily android_getCpuFamily(void);
/* Return a bitmap describing a set of optional CPU features that are
* supported by the current device's CPU. The exact bit-flags returned
* depend on the value returned by android_getCpuFamily(). See the
* documentation for the ANDROID_CPU_*_FEATURE_* flags below for details.
*/
extern uint64_t android_getCpuFeatures(void);
/* The list of feature flags for ANDROID_CPU_FAMILY_ARM that can be
* recognized by the library (see note below for 64-bit ARM). Value details
* are:
*
* VFPv2:
* CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
* support these instructions. VFPv2 is a subset of VFPv3 so this will
* be set whenever VFPv3 is set too.
*
* ARMv7:
* CPU supports the ARMv7-A basic instruction set.
* This feature is mandated by the 'armeabi-v7a' ABI.
*
* VFPv3:
* CPU supports the VFPv3-D16 instruction set, providing hardware FPU
* support for single and double precision floating point registers.
* Note that only 16 FPU registers are available by default, unless
* the D32 bit is set too. This feature is also mandated by the
* 'armeabi-v7a' ABI.
*
* VFP_D32:
* CPU VFP optional extension that provides 32 FPU registers,
* instead of 16. Note that ARM mandates this feature is the 'NEON'
* feature is implemented by the CPU.
*
* NEON:
* CPU FPU supports "ARM Advanced SIMD" instructions, also known as
* NEON. Note that this mandates the VFP_D32 feature as well, per the
* ARM Architecture specification.
*
* VFP_FP16:
* Half-width floating precision VFP extension. If set, the CPU
* supports instructions to perform floating-point operations on
* 16-bit registers. This is part of the VFPv4 specification, but
* not mandated by any Android ABI.
*
* VFP_FMA:
* Fused multiply-accumulate VFP instructions extension. Also part of
* the VFPv4 specification, but not mandated by any Android ABI.
*
* NEON_FMA:
* Fused multiply-accumulate NEON instructions extension. Optional
* extension from the VFPv4 specification, but not mandated by any
* Android ABI.
*
* IDIV_ARM:
* Integer division available in ARM mode. Only available
* on recent CPUs (e.g. Cortex-A15).
*
* IDIV_THUMB2:
* Integer division available in Thumb-2 mode. Only available
* on recent CPUs (e.g. Cortex-A15).
*
* iWMMXt:
* Optional extension that adds MMX registers and operations to an
* ARM CPU. This is only available on a few XScale-based CPU designs
* sold by Marvell. Pretty rare in practice.
*
* AES:
* CPU supports AES instructions. These instructions are only
* available for 32-bit applications running on ARMv8 CPU.
*
* CRC32:
* CPU supports CRC32 instructions. These instructions are only
* available for 32-bit applications running on ARMv8 CPU.
*
* SHA2:
* CPU supports SHA2 instructions. These instructions are only
* available for 32-bit applications running on ARMv8 CPU.
*
* SHA1:
* CPU supports SHA1 instructions. These instructions are only
* available for 32-bit applications running on ARMv8 CPU.
*
* PMULL:
* CPU supports 64-bit PMULL and PMULL2 instructions. These
* instructions are only available for 32-bit applications
* running on ARMv8 CPU.
*
* If you want to tell the compiler to generate code that targets one of
* the feature set above, you should probably use one of the following
* flags (for more details, see technical note at the end of this file):
*
* -mfpu=vfp
* -mfpu=vfpv2
* These are equivalent and tell GCC to use VFPv2 instructions for
* floating-point operations. Use this if you want your code to
* run on *some* ARMv6 devices, and any ARMv7-A device supported
* by Android.
*
* Generated code requires VFPv2 feature.
*
* -mfpu=vfpv3-d16
* Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
* This should be generic code that runs on any CPU that supports the
* 'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
*
* Generated code requires VFPv3 feature.
*
* -mfpu=vfpv3
* Tell GCC to use VFPv3 instructions with 32 FPU registers.
* Generated code requires VFPv3|VFP_D32 features.
*
* -mfpu=neon
* Tell GCC to use VFPv3 instructions with 32 FPU registers, and
* also support NEON intrinsics (see <arm_neon.h>).
* Generated code requires VFPv3|VFP_D32|NEON features.
*
* -mfpu=vfpv4-d16
* Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
*
* -mfpu=vfpv4
* Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
*
* -mfpu=neon-vfpv4
* Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
* features.
*
* -mcpu=cortex-a7
* -mcpu=cortex-a15
* Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
* NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
* This flag implies -mfpu=neon-vfpv4.
*
* -mcpu=iwmmxt
* Allows the use of iWMMXt instrinsics with GCC.
*
* IMPORTANT NOTE: These flags should only be tested when
* android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM, i.e. this is a
* 32-bit process.
*
* When running a 64-bit ARM process on an ARMv8 CPU,
* android_getCpuFeatures() will return a different set of bitflags
*/
enum {
ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0),
ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1),
ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2),
ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4),
ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5),
ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6),
ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7),
ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8),
ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9),
ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11),
ANDROID_CPU_ARM_FEATURE_AES = (1 << 12),
ANDROID_CPU_ARM_FEATURE_PMULL = (1 << 13),
ANDROID_CPU_ARM_FEATURE_SHA1 = (1 << 14),
ANDROID_CPU_ARM_FEATURE_SHA2 = (1 << 15),
ANDROID_CPU_ARM_FEATURE_CRC32 = (1 << 16),
};
/* The bit flags corresponding to the output of android_getCpuFeatures()
* when android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM64. Value details
* are:
*
* FP:
* CPU has Floating-point unit.
*
* ASIMD:
* CPU has Advanced SIMD unit.
*
* AES:
* CPU supports AES instructions.
*
* CRC32:
* CPU supports CRC32 instructions.
*
* SHA2:
* CPU supports SHA2 instructions.
*
* SHA1:
* CPU supports SHA1 instructions.
*
* PMULL:
* CPU supports 64-bit PMULL and PMULL2 instructions.
*/
enum {
ANDROID_CPU_ARM64_FEATURE_FP = (1 << 0),
ANDROID_CPU_ARM64_FEATURE_ASIMD = (1 << 1),
ANDROID_CPU_ARM64_FEATURE_AES = (1 << 2),
ANDROID_CPU_ARM64_FEATURE_PMULL = (1 << 3),
ANDROID_CPU_ARM64_FEATURE_SHA1 = (1 << 4),
ANDROID_CPU_ARM64_FEATURE_SHA2 = (1 << 5),
ANDROID_CPU_ARM64_FEATURE_CRC32 = (1 << 6),
};
/* The bit flags corresponding to the output of android_getCpuFeatures()
* when android_getCpuFamily() returns ANDROID_CPU_FAMILY_X86 or
* ANDROID_CPU_FAMILY_X86_64.
*/
enum {
ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0),
ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2),
ANDROID_CPU_X86_FEATURE_SSE4_1 = (1 << 3),
ANDROID_CPU_X86_FEATURE_SSE4_2 = (1 << 4),
ANDROID_CPU_X86_FEATURE_AES_NI = (1 << 5),
ANDROID_CPU_X86_FEATURE_AVX = (1 << 6),
ANDROID_CPU_X86_FEATURE_RDRAND = (1 << 7),
ANDROID_CPU_X86_FEATURE_AVX2 = (1 << 8),
ANDROID_CPU_X86_FEATURE_SHA_NI = (1 << 9),
};
/* The bit flags corresponding to the output of android_getCpuFeatures()
* when android_getCpuFamily() returns ANDROID_CPU_FAMILY_MIPS
* or ANDROID_CPU_FAMILY_MIPS64. Values are:
*
* R6:
* CPU executes MIPS Release 6 instructions natively, and
* supports obsoleted R1..R5 instructions only via kernel traps.
*
* MSA:
* CPU supports Mips SIMD Architecture instructions.
*/
enum {
ANDROID_CPU_MIPS_FEATURE_R6 = (1 << 0),
ANDROID_CPU_MIPS_FEATURE_MSA = (1 << 1),
};
/* Return the number of CPU cores detected on this device. */
extern int android_getCpuCount(void);
/* The following is used to force the CPU count and features
* mask in sandboxed processes. Under 4.1 and higher, these processes
* cannot access /proc, which is the only way to get information from
* the kernel about the current hardware (at least on ARM).
*
* It _must_ be called only once, and before any android_getCpuXXX
* function, any other case will fail.
*
* This function return 1 on success, and 0 on failure.
*/
extern int android_setCpu(int cpu_count,
uint64_t cpu_features);
#ifdef __arm__
/* Retrieve the ARM 32-bit CPUID value from the kernel.
* Note that this cannot work on sandboxed processes under 4.1 and
* higher, unless you called android_setCpuArm() before.
*/
extern uint32_t android_getCpuIdArm(void);
/* An ARM-specific variant of android_setCpu() that also allows you
* to set the ARM CPUID field.
*/
extern int android_setCpuArm(int cpu_count,
uint64_t cpu_features,
uint32_t cpu_id);
#endif
__END_DECLS
#endif /* CPU_FEATURES_H */

View File

@ -0,0 +1,3 @@
downloads/
*.dll
ffmpeg_version.cmake

View File

@ -0,0 +1,79 @@
$url = "https://raw.githubusercontent.com/opencv/opencv_3rdparty/@FFMPEG_BINARIES_COMMIT@/ffmpeg/opencv_videoio_ffmpeg_64.dll"
$expected_md5 = "@FFMPEG_FILE_HASH_BIN64@"
$output = "$PSScriptRoot\@OPENCV_BIN_INSTALL_PATH@\opencv_videoio_ffmpeg@OPENCV_DLLVERSION@_64.dll"
Write-Output ("=" * 120)
try {
Get-content -Path "$PSScriptRoot\@OPENCV_LICENSES_INSTALL_PATH@\ffmpeg-readme.txt" -ErrorAction 'Stop'
} catch {
Write-Output "Refer to OpenCV FFmpeg wrapper readme notes about library usage / licensing details."
}
Write-Output ("=" * 120)
Write-Output ""
if(![System.IO.File]::Exists($output)) {
try {
[io.file]::OpenWrite($output).close()
} catch {
Write-Warning "Unable to write: $output"
if (!([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator")) {
Write-Warning "Launching with 'Administrator' elevated privileges..."
Pause
Start-Process powershell.exe "-NoProfile -ExecutionPolicy Bypass -File `"$PSCommandPath`"" -Verb RunAs
exit
} else {
Write-Output "FATAL: Unable to write with elevated privileges: $output"
Pause
exit 1
}
}
try {
Write-Output ("Downloading: " + $output)
Import-Module BitsTransfer
$start_time = Get-Date
Start-BitsTransfer -Source $url -Destination $output -ErrorAction 'Stop'
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
$_ # Dump error
try {
Write-Output ("Downloading (second attempt): " + $output)
$start_time = Get-Date
Invoke-WebRequest -Uri $url -OutFile $output
Write-Output "Downloaded in $((Get-Date).Subtract($start_time).Seconds) seconds"
} catch {
Write-Output ("Can't download file: " + $output)
Write-Output ("URL: " + $url)
Write-Output "You need to download this file manually. Stop"
Pause
Exit
}
}
} else {
Write-Output ("File exists: " + $output)
Write-Output ("Downloading is skipped. Remove this file and re-run this script to force downloading.")
}
if(![System.IO.File]::Exists($output)) {
Write-Output ("Destination file not found: " + $output)
Write-Output "Stop"
Pause
Exit
}
try {
$hash = Get-FileHash $output -Algorithm MD5 -ErrorAction 'Stop'
if($hash.Hash -eq $expected_md5) {
Write-Output "MD5 check passed"
} else {
Write-Output ("MD5 : " + $hash.Hash.toLower())
Write-Output ("Expected: " + $expected_md5)
Write-Output "MD5 hash mismatch"
}
} catch {
$_ # Dump error
Write-Output "Can't check MD5 hash (requires PowerShell 4+)"
}
Pause
Write-Output "Exit"

View File

@ -0,0 +1,44 @@
# Binaries branch name: ffmpeg/4.x_20240522
# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")
function(download_win_ffmpeg script_var)
set(${script_var} "" PARENT_SCOPE)
set(ids BIN32 BIN64 CMAKE)
set(name_BIN32 "opencv_videoio_ffmpeg.dll")
set(name_BIN64 "opencv_videoio_ffmpeg_64.dll")
set(name_CMAKE "ffmpeg_version.cmake")
set(FFMPEG_DOWNLOAD_DIR "${OpenCV_BINARY_DIR}/3rdparty/ffmpeg")
set(status TRUE)
foreach(id ${ids})
ocv_download(FILENAME ${name_${id}}
HASH ${FFMPEG_FILE_HASH_${id}}
URL
"$ENV{OPENCV_FFMPEG_URL}"
"${OPENCV_FFMPEG_URL}"
"https://raw.githubusercontent.com/opencv/opencv_3rdparty/${FFMPEG_BINARIES_COMMIT}/ffmpeg/"
DESTINATION_DIR ${FFMPEG_DOWNLOAD_DIR}
ID FFMPEG
RELATIVE_URL
STATUS res)
if(NOT res)
set(status FALSE)
endif()
endforeach()
if(status)
set(${script_var} "${FFMPEG_DOWNLOAD_DIR}/ffmpeg_version.cmake" PARENT_SCOPE)
endif()
endfunction()
if(OPENCV_INSTALL_FFMPEG_DOWNLOAD_SCRIPT)
configure_file("${CMAKE_CURRENT_LIST_DIR}/ffmpeg-download.ps1.in" "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/win-install/ffmpeg-download.ps1" DESTINATION "." COMPONENT libs)
endif()
ocv_install_3rdparty_licenses(ffmpeg license.txt readme.txt)

View File

@ -0,0 +1,520 @@
Copyright (C) 2001 Fabrice Bellard
FFmpeg is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
FFmpeg is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with FFmpeg; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
==================================================================================
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!

View File

@ -0,0 +1,37 @@
* On Linux and other Unix flavors OpenCV uses default or user-built ffmpeg/libav libraries.
If user builds ffmpeg/libav from source and wants OpenCV to stay BSD library, not GPL/LGPL,
he/she should use --enabled-shared configure flag and make sure that no GPL components are
enabled (some notable examples are x264 (H264 encoder) and libac3 (Dolby AC3 audio codec)).
See https://www.ffmpeg.org/legal.html for details.
If you want to play very safe and do not want to use FFMPEG at all, regardless of whether it's installed on
your system or not, configure and build OpenCV using CMake with WITH_FFMPEG=OFF flag. OpenCV will then use
AVFoundation (OSX), GStreamer (Linux) or other available backends supported by opencv_videoio module.
There is also our self-contained motion jpeg codec, which you can use without any worries.
It handles CV_FOURCC('M', 'J', 'P', 'G') streams within an AVI container (".avi").
* On Windows OpenCV uses pre-built ffmpeg binaries, built with proper flags (without GPL components) and
wrapped with simple, stable OpenCV-compatible API.
The binaries are opencv_videoio_ffmpeg.dll (version for 32-bit Windows) and
opencv_videoio_ffmpeg_64.dll (version for 64-bit Windows).
The pre-built opencv_videoio_ffmpeg*.dll is:
* LGPL library, not BSD libraries.
* Loaded at runtime by opencv_videoio module.
If it succeeds, ffmpeg can be used to decode/encode videos;
otherwise, other API is used.
FFMPEG build includes support for H264 encoder based on the OpenH264 library.
OpenH264 Video Codec provided by Cisco Systems, Inc.
See https://github.com/cisco/openh264/releases for details and OpenH264 license.
OpenH264 library should be installed separatelly. Downloaded binary file can be placed into global system path
(System32 or SysWOW64) or near application binaries (check documentation of "LoadLibrary" Win32 function from MSDN).
Or you can specify location of binary file via OPENH264_LIBRARY environment variable.
If LGPL/GPL software can not be supplied with your OpenCV-based product, simply exclude
opencv_videoio_ffmpeg*.dll from your distribution; OpenCV will stay fully functional except for the ability to
decode/encode videos using FFMPEG (though, it may still be able to do that using other API,
such as Video for Windows, Windows Media Foundation or our self-contained motion jpeg codec).
See license.txt for the FFMPEG copyright notice and the licensing terms.

View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1 @@
Origin: https://github.com/google/flatbuffers/tree/v23.5.9

View File

@ -0,0 +1,68 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_ALLOCATOR_H_
#define FLATBUFFERS_ALLOCATOR_H_
#include "flatbuffers/base.h"
namespace flatbuffers {
// Allocator interface. This is flatbuffers-specific and meant only for
// `vector_downward` usage.
class Allocator {
public:
virtual ~Allocator() {}
// Allocate `size` bytes of memory.
virtual uint8_t *allocate(size_t size) = 0;
// Deallocate `size` bytes of memory at `p` allocated by this allocator.
virtual void deallocate(uint8_t *p, size_t size) = 0;
// Reallocate `new_size` bytes of memory, replacing the old region of size
// `old_size` at `p`. In contrast to a normal realloc, this grows downwards,
// and is intended specifcally for `vector_downward` use.
// `in_use_back` and `in_use_front` indicate how much of `old_size` is
// actually in use at each end, and needs to be copied.
virtual uint8_t *reallocate_downward(uint8_t *old_p, size_t old_size,
size_t new_size, size_t in_use_back,
size_t in_use_front) {
FLATBUFFERS_ASSERT(new_size > old_size); // vector_downward only grows
uint8_t *new_p = allocate(new_size);
memcpy_downward(old_p, old_size, new_p, new_size, in_use_back,
in_use_front);
deallocate(old_p, old_size);
return new_p;
}
protected:
// Called by `reallocate_downward` to copy memory from `old_p` of `old_size`
// to `new_p` of `new_size`. Only memory of size `in_use_front` and
// `in_use_back` will be copied from the front and back of the old memory
// allocation.
void memcpy_downward(uint8_t *old_p, size_t old_size, uint8_t *new_p,
size_t new_size, size_t in_use_back,
size_t in_use_front) {
memcpy(new_p + new_size - in_use_back, old_p + old_size - in_use_back,
in_use_back);
memcpy(new_p, old_p, in_use_front);
}
};
} // namespace flatbuffers
#endif // FLATBUFFERS_ALLOCATOR_H_

View File

@ -0,0 +1,256 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_ARRAY_H_
#define FLATBUFFERS_ARRAY_H_
#include <cstdint>
#include <memory>
#include "flatbuffers/base.h"
#include "flatbuffers/stl_emulation.h"
#include "flatbuffers/vector.h"
namespace flatbuffers {
// This is used as a helper type for accessing arrays.
template<typename T, uint16_t length> class Array {
// Array<T> can carry only POD data types (scalars or structs).
typedef typename flatbuffers::bool_constant<flatbuffers::is_scalar<T>::value>
scalar_tag;
typedef
typename flatbuffers::conditional<scalar_tag::value, T, const T *>::type
IndirectHelperType;
public:
typedef uint16_t size_type;
typedef typename IndirectHelper<IndirectHelperType>::return_type return_type;
typedef VectorConstIterator<T, return_type, uoffset_t> const_iterator;
typedef VectorReverseIterator<const_iterator> const_reverse_iterator;
// If T is a LE-scalar or a struct (!scalar_tag::value).
static FLATBUFFERS_CONSTEXPR bool is_span_observable =
(scalar_tag::value && (FLATBUFFERS_LITTLEENDIAN || sizeof(T) == 1)) ||
!scalar_tag::value;
FLATBUFFERS_CONSTEXPR uint16_t size() const { return length; }
return_type Get(uoffset_t i) const {
FLATBUFFERS_ASSERT(i < size());
return IndirectHelper<IndirectHelperType>::Read(Data(), i);
}
return_type operator[](uoffset_t i) const { return Get(i); }
// If this is a Vector of enums, T will be its storage type, not the enum
// type. This function makes it convenient to retrieve value with enum
// type E.
template<typename E> E GetEnum(uoffset_t i) const {
return static_cast<E>(Get(i));
}
const_iterator begin() const { return const_iterator(Data(), 0); }
const_iterator end() const { return const_iterator(Data(), size()); }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const_reverse_iterator crbegin() const { return rbegin(); }
const_reverse_iterator crend() const { return rend(); }
// Get a mutable pointer to elements inside this array.
// This method used to mutate arrays of structs followed by a @p Mutate
// operation. For primitive types use @p Mutate directly.
// @warning Assignments and reads to/from the dereferenced pointer are not
// automatically converted to the correct endianness.
typename flatbuffers::conditional<scalar_tag::value, void, T *>::type
GetMutablePointer(uoffset_t i) const {
FLATBUFFERS_ASSERT(i < size());
return const_cast<T *>(&data()[i]);
}
// Change elements if you have a non-const pointer to this object.
void Mutate(uoffset_t i, const T &val) { MutateImpl(scalar_tag(), i, val); }
// The raw data in little endian format. Use with care.
const uint8_t *Data() const { return data_; }
uint8_t *Data() { return data_; }
// Similarly, but typed, much like std::vector::data
const T *data() const { return reinterpret_cast<const T *>(Data()); }
T *data() { return reinterpret_cast<T *>(Data()); }
// Copy data from a span with endian conversion.
// If this Array and the span overlap, the behavior is undefined.
void CopyFromSpan(flatbuffers::span<const T, length> src) {
const auto p1 = reinterpret_cast<const uint8_t *>(src.data());
const auto p2 = Data();
FLATBUFFERS_ASSERT(!(p1 >= p2 && p1 < (p2 + length)) &&
!(p2 >= p1 && p2 < (p1 + length)));
(void)p1;
(void)p2;
CopyFromSpanImpl(flatbuffers::bool_constant<is_span_observable>(), src);
}
protected:
void MutateImpl(flatbuffers::true_type, uoffset_t i, const T &val) {
FLATBUFFERS_ASSERT(i < size());
WriteScalar(data() + i, val);
}
void MutateImpl(flatbuffers::false_type, uoffset_t i, const T &val) {
*(GetMutablePointer(i)) = val;
}
void CopyFromSpanImpl(flatbuffers::true_type,
flatbuffers::span<const T, length> src) {
// Use std::memcpy() instead of std::copy() to avoid performance degradation
// due to aliasing if T is char or unsigned char.
// The size is known at compile time, so memcpy would be inlined.
std::memcpy(data(), src.data(), length * sizeof(T));
}
// Copy data from flatbuffers::span with endian conversion.
void CopyFromSpanImpl(flatbuffers::false_type,
flatbuffers::span<const T, length> src) {
for (size_type k = 0; k < length; k++) { Mutate(k, src[k]); }
}
// This class is only used to access pre-existing data. Don't ever
// try to construct these manually.
// 'constexpr' allows us to use 'size()' at compile time.
// @note Must not use 'FLATBUFFERS_CONSTEXPR' here, as const is not allowed on
// a constructor.
#if defined(__cpp_constexpr)
constexpr Array();
#else
Array();
#endif
uint8_t data_[length * sizeof(T)];
private:
// This class is a pointer. Copying will therefore create an invalid object.
// Private and unimplemented copy constructor.
Array(const Array &);
Array &operator=(const Array &);
};
// Specialization for Array[struct] with access using Offset<void> pointer.
// This specialization used by idl_gen_text.cpp.
template<typename T, uint16_t length, template<typename> class OffsetT>
class Array<OffsetT<T>, length> {
static_assert(flatbuffers::is_same<T, void>::value, "unexpected type T");
public:
typedef const void *return_type;
typedef uint16_t size_type;
const uint8_t *Data() const { return data_; }
// Make idl_gen_text.cpp::PrintContainer happy.
return_type operator[](uoffset_t) const {
FLATBUFFERS_ASSERT(false);
return nullptr;
}
private:
// This class is only used to access pre-existing data.
Array();
Array(const Array &);
Array &operator=(const Array &);
uint8_t data_[1];
};
template<class U, uint16_t N>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<U, N> make_span(Array<U, N> &arr)
FLATBUFFERS_NOEXCEPT {
static_assert(
Array<U, N>::is_span_observable,
"wrong type U, only plain struct, LE-scalar, or byte types are allowed");
return span<U, N>(arr.data(), N);
}
template<class U, uint16_t N>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const U, N> make_span(
const Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
static_assert(
Array<U, N>::is_span_observable,
"wrong type U, only plain struct, LE-scalar, or byte types are allowed");
return span<const U, N>(arr.data(), N);
}
template<class U, uint16_t N>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<uint8_t, sizeof(U) * N>
make_bytes_span(Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
static_assert(Array<U, N>::is_span_observable,
"internal error, Array<T> might hold only scalars or structs");
return span<uint8_t, sizeof(U) * N>(arr.Data(), sizeof(U) * N);
}
template<class U, uint16_t N>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const uint8_t, sizeof(U) * N>
make_bytes_span(const Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
static_assert(Array<U, N>::is_span_observable,
"internal error, Array<T> might hold only scalars or structs");
return span<const uint8_t, sizeof(U) * N>(arr.Data(), sizeof(U) * N);
}
// Cast a raw T[length] to a raw flatbuffers::Array<T, length>
// without endian conversion. Use with care.
// TODO: move these Cast-methods to `internal` namespace.
template<typename T, uint16_t length>
Array<T, length> &CastToArray(T (&arr)[length]) {
return *reinterpret_cast<Array<T, length> *>(arr);
}
template<typename T, uint16_t length>
const Array<T, length> &CastToArray(const T (&arr)[length]) {
return *reinterpret_cast<const Array<T, length> *>(arr);
}
template<typename E, typename T, uint16_t length>
Array<E, length> &CastToArrayOfEnum(T (&arr)[length]) {
static_assert(sizeof(E) == sizeof(T), "invalid enum type E");
return *reinterpret_cast<Array<E, length> *>(arr);
}
template<typename E, typename T, uint16_t length>
const Array<E, length> &CastToArrayOfEnum(const T (&arr)[length]) {
static_assert(sizeof(E) == sizeof(T), "invalid enum type E");
return *reinterpret_cast<const Array<E, length> *>(arr);
}
template<typename T, uint16_t length>
bool operator==(const Array<T, length> &lhs,
const Array<T, length> &rhs) noexcept {
return std::addressof(lhs) == std::addressof(rhs) ||
(lhs.size() == rhs.size() &&
std::memcmp(lhs.Data(), rhs.Data(), rhs.size() * sizeof(T)) == 0);
}
} // namespace flatbuffers
#endif // FLATBUFFERS_ARRAY_H_

View File

@ -0,0 +1,495 @@
#ifndef FLATBUFFERS_BASE_H_
#define FLATBUFFERS_BASE_H_
// clang-format off
// If activate should be declared and included first.
#if defined(FLATBUFFERS_MEMORY_LEAK_TRACKING) && \
defined(_MSC_VER) && defined(_DEBUG)
// The _CRTDBG_MAP_ALLOC inside <crtdbg.h> will replace
// calloc/free (etc) to its debug version using #define directives.
#define _CRTDBG_MAP_ALLOC
#include <stdlib.h>
#include <crtdbg.h>
// Replace operator new by trace-enabled version.
#define DEBUG_NEW new(_NORMAL_BLOCK, __FILE__, __LINE__)
#define new DEBUG_NEW
#endif
#if !defined(FLATBUFFERS_ASSERT)
#include <assert.h>
#define FLATBUFFERS_ASSERT assert
#elif defined(FLATBUFFERS_ASSERT_INCLUDE)
// Include file with forward declaration
#include FLATBUFFERS_ASSERT_INCLUDE
#endif
#ifndef ARDUINO
#include <cstdint>
#endif
#include <cstddef>
#include <cstdlib>
#include <cstring>
#if defined(ARDUINO) && !defined(ARDUINOSTL_M_H) && defined(__AVR__)
#include <utility.h>
#else
#include <utility>
#endif
#include <string>
#include <type_traits>
#include <vector>
#include <set>
#include <algorithm>
#include <limits>
#include <iterator>
#include <memory>
#if defined(__unix__) && !defined(FLATBUFFERS_LOCALE_INDEPENDENT)
#include <unistd.h>
#endif
#ifdef __ANDROID__
#include <android/api-level.h>
#endif
#if defined(__ICCARM__)
#include <intrinsics.h>
#endif
// Note the __clang__ check is needed, because clang presents itself
// as an older GNUC compiler (4.2).
// Clang 3.3 and later implement all of the ISO C++ 2011 standard.
// Clang 3.4 and later implement all of the ISO C++ 2014 standard.
// http://clang.llvm.org/cxx_status.html
// Note the MSVC value '__cplusplus' may be incorrect:
// The '__cplusplus' predefined macro in the MSVC stuck at the value 199711L,
// indicating (erroneously!) that the compiler conformed to the C++98 Standard.
// This value should be correct starting from MSVC2017-15.7-Preview-3.
// The '__cplusplus' will be valid only if MSVC2017-15.7-P3 and the `/Zc:__cplusplus` switch is set.
// Workaround (for details see MSDN):
// Use the _MSC_VER and _MSVC_LANG definition instead of the __cplusplus for compatibility.
// The _MSVC_LANG macro reports the Standard version regardless of the '/Zc:__cplusplus' switch.
#if defined(__GNUC__) && !defined(__clang__)
#define FLATBUFFERS_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#else
#define FLATBUFFERS_GCC 0
#endif
#if defined(__clang__)
#define FLATBUFFERS_CLANG (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#else
#define FLATBUFFERS_CLANG 0
#endif
/// @cond FLATBUFFERS_INTERNAL
#if __cplusplus <= 199711L && \
(!defined(_MSC_VER) || _MSC_VER < 1600) && \
(!defined(__GNUC__) || \
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40400))
#error A C++11 compatible compiler with support for the auto typing is \
required for FlatBuffers.
#error __cplusplus _MSC_VER __GNUC__ __GNUC_MINOR__ __GNUC_PATCHLEVEL__
#endif
#if !defined(__clang__) && \
defined(__GNUC__) && \
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ < 40600)
// Backwards compatibility for g++ 4.4, and 4.5 which don't have the nullptr
// and constexpr keywords. Note the __clang__ check is needed, because clang
// presents itself as an older GNUC compiler.
#ifndef nullptr_t
const class nullptr_t {
public:
template<class T> inline operator T*() const { return 0; }
private:
void operator&() const;
} nullptr = {};
#endif
#ifndef constexpr
#define constexpr const
#endif
#endif
// The wire format uses a little endian encoding (since that's efficient for
// the common platforms).
#if defined(__s390x__)
#define FLATBUFFERS_LITTLEENDIAN 0
#endif // __s390x__
#if !defined(FLATBUFFERS_LITTLEENDIAN)
#if defined(__GNUC__) || defined(__clang__) || defined(__ICCARM__)
#if (defined(__BIG_ENDIAN__) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
#define FLATBUFFERS_LITTLEENDIAN 0
#else
#define FLATBUFFERS_LITTLEENDIAN 1
#endif // __BIG_ENDIAN__
#elif defined(_MSC_VER)
#if defined(_M_PPC)
#define FLATBUFFERS_LITTLEENDIAN 0
#else
#define FLATBUFFERS_LITTLEENDIAN 1
#endif
#else
#error Unable to determine endianness, define FLATBUFFERS_LITTLEENDIAN.
#endif
#endif // !defined(FLATBUFFERS_LITTLEENDIAN)
#define FLATBUFFERS_VERSION_MAJOR 23
#define FLATBUFFERS_VERSION_MINOR 5
#define FLATBUFFERS_VERSION_REVISION 9
#define FLATBUFFERS_STRING_EXPAND(X) #X
#define FLATBUFFERS_STRING(X) FLATBUFFERS_STRING_EXPAND(X)
namespace flatbuffers {
// Returns version as string "MAJOR.MINOR.REVISION".
const char* FLATBUFFERS_VERSION();
}
#if (!defined(_MSC_VER) || _MSC_VER > 1600) && \
(!defined(__GNUC__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 407)) || \
defined(__clang__)
#define FLATBUFFERS_FINAL_CLASS final
#define FLATBUFFERS_OVERRIDE override
#define FLATBUFFERS_EXPLICIT_CPP11 explicit
#define FLATBUFFERS_VTABLE_UNDERLYING_TYPE : flatbuffers::voffset_t
#else
#define FLATBUFFERS_FINAL_CLASS
#define FLATBUFFERS_OVERRIDE
#define FLATBUFFERS_EXPLICIT_CPP11
#define FLATBUFFERS_VTABLE_UNDERLYING_TYPE
#endif
#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \
(!defined(__GNUC__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 406)) || \
(defined(__cpp_constexpr) && __cpp_constexpr >= 200704)
#define FLATBUFFERS_CONSTEXPR constexpr
#define FLATBUFFERS_CONSTEXPR_CPP11 constexpr
#define FLATBUFFERS_CONSTEXPR_DEFINED
#else
#define FLATBUFFERS_CONSTEXPR const
#define FLATBUFFERS_CONSTEXPR_CPP11
#endif
#if (defined(__cplusplus) && __cplusplus >= 201402L) || \
(defined(__cpp_constexpr) && __cpp_constexpr >= 201304)
#define FLATBUFFERS_CONSTEXPR_CPP14 FLATBUFFERS_CONSTEXPR_CPP11
#else
#define FLATBUFFERS_CONSTEXPR_CPP14
#endif
#if (defined(__GXX_EXPERIMENTAL_CXX0X__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 406)) || \
(defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190023026)) || \
defined(__clang__)
#define FLATBUFFERS_NOEXCEPT noexcept
#else
#define FLATBUFFERS_NOEXCEPT
#endif
// NOTE: the FLATBUFFERS_DELETE_FUNC macro may change the access mode to
// private, so be sure to put it at the end or reset access mode explicitly.
#if (!defined(_MSC_VER) || _MSC_FULL_VER >= 180020827) && \
(!defined(__GNUC__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 404)) || \
defined(__clang__)
#define FLATBUFFERS_DELETE_FUNC(func) func = delete
#else
#define FLATBUFFERS_DELETE_FUNC(func) private: func
#endif
#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \
(!defined(__GNUC__) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 409)) || \
defined(__clang__)
#define FLATBUFFERS_DEFAULT_DECLARATION
#endif
// Check if we can use template aliases
// Not possible if Microsoft Compiler before 2012
// Possible is the language feature __cpp_alias_templates is defined well
// Or possible if the C++ std is C+11 or newer
#if (defined(_MSC_VER) && _MSC_VER > 1700 /* MSVC2012 */) \
|| (defined(__cpp_alias_templates) && __cpp_alias_templates >= 200704) \
|| (defined(__cplusplus) && __cplusplus >= 201103L)
#define FLATBUFFERS_TEMPLATES_ALIASES
#endif
#ifndef FLATBUFFERS_HAS_STRING_VIEW
// Only provide flatbuffers::string_view if __has_include can be used
// to detect a header that provides an implementation
#if defined(__has_include)
// Check for std::string_view (in c++17)
#if __has_include(<string_view>) && (__cplusplus >= 201606 || (defined(_HAS_CXX17) && _HAS_CXX17))
#include <string_view>
namespace flatbuffers {
typedef std::string_view string_view;
}
#define FLATBUFFERS_HAS_STRING_VIEW 1
// Check for std::experimental::string_view (in c++14, compiler-dependent)
#elif __has_include(<experimental/string_view>) && (__cplusplus >= 201411)
#include <experimental/string_view>
namespace flatbuffers {
typedef std::experimental::string_view string_view;
}
#define FLATBUFFERS_HAS_STRING_VIEW 1
// Check for absl::string_view
#elif __has_include("absl/strings/string_view.h") && \
__has_include("absl/base/config.h") && \
(__cplusplus >= 201411)
#include "absl/base/config.h"
#if !defined(ABSL_USES_STD_STRING_VIEW)
#include "absl/strings/string_view.h"
namespace flatbuffers {
typedef absl::string_view string_view;
}
#define FLATBUFFERS_HAS_STRING_VIEW 1
#endif
#endif
#endif // __has_include
#endif // !FLATBUFFERS_HAS_STRING_VIEW
#ifndef FLATBUFFERS_GENERAL_HEAP_ALLOC_OK
// Allow heap allocations to be used
#define FLATBUFFERS_GENERAL_HEAP_ALLOC_OK 1
#endif // !FLATBUFFERS_GENERAL_HEAP_ALLOC_OK
#ifndef FLATBUFFERS_HAS_NEW_STRTOD
// Modern (C++11) strtod and strtof functions are available for use.
// 1) nan/inf strings as argument of strtod;
// 2) hex-float as argument of strtod/strtof.
#if (defined(_MSC_VER) && _MSC_VER >= 1900) || \
(defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 409)) || \
(defined(__clang__))
#define FLATBUFFERS_HAS_NEW_STRTOD 1
#endif
#endif // !FLATBUFFERS_HAS_NEW_STRTOD
#ifndef FLATBUFFERS_LOCALE_INDEPENDENT
// Enable locale independent functions {strtof_l, strtod_l,strtoll_l,
// strtoull_l}.
#if (defined(_MSC_VER) && _MSC_VER >= 1800) || \
(defined(__ANDROID_API__) && __ANDROID_API__>= 21) || \
(defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700)) && \
(!defined(__Fuchsia__) && !defined(__ANDROID_API__))
#define FLATBUFFERS_LOCALE_INDEPENDENT 1
#else
#define FLATBUFFERS_LOCALE_INDEPENDENT 0
#endif
#endif // !FLATBUFFERS_LOCALE_INDEPENDENT
// Suppress Undefined Behavior Sanitizer (recoverable only). Usage:
// - __suppress_ubsan__("undefined")
// - __suppress_ubsan__("signed-integer-overflow")
#if defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >=7))
#define __suppress_ubsan__(type) __attribute__((no_sanitize(type)))
#elif defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 409)
#define __suppress_ubsan__(type) __attribute__((no_sanitize_undefined))
#else
#define __suppress_ubsan__(type)
#endif
// This is constexpr function used for checking compile-time constants.
// Avoid `#pragma warning(disable: 4127) // C4127: expression is constant`.
template<typename T> FLATBUFFERS_CONSTEXPR inline bool IsConstTrue(T t) {
return !!t;
}
// Enable C++ attribute [[]] if std:c++17 or higher.
#if ((__cplusplus >= 201703L) \
|| (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
// All attributes unknown to an implementation are ignored without causing an error.
#define FLATBUFFERS_ATTRIBUTE(attr) attr
#define FLATBUFFERS_FALLTHROUGH() [[fallthrough]]
#else
#define FLATBUFFERS_ATTRIBUTE(attr)
#if FLATBUFFERS_CLANG >= 30800
#define FLATBUFFERS_FALLTHROUGH() [[clang::fallthrough]]
#elif FLATBUFFERS_GCC >= 70300
#define FLATBUFFERS_FALLTHROUGH() [[gnu::fallthrough]]
#else
#define FLATBUFFERS_FALLTHROUGH()
#endif
#endif
/// @endcond
/// @file
namespace flatbuffers {
/// @cond FLATBUFFERS_INTERNAL
// Our default offset / size type, 32bit on purpose on 64bit systems.
// Also, using a consistent offset type maintains compatibility of serialized
// offset values between 32bit and 64bit systems.
typedef uint32_t uoffset_t;
typedef uint64_t uoffset64_t;
// Signed offsets for references that can go in both directions.
typedef int32_t soffset_t;
typedef int64_t soffset64_t;
// Offset/index used in v-tables, can be changed to uint8_t in
// format forks to save a bit of space if desired.
typedef uint16_t voffset_t;
typedef uintmax_t largest_scalar_t;
// In 32bits, this evaluates to 2GB - 1
#define FLATBUFFERS_MAX_BUFFER_SIZE std::numeric_limits<::flatbuffers::soffset_t>::max()
#define FLATBUFFERS_MAX_64_BUFFER_SIZE std::numeric_limits<::flatbuffers::soffset64_t>::max()
// The minimum size buffer that can be a valid flatbuffer.
// Includes the offset to the root table (uoffset_t), the offset to the vtable
// of the root table (soffset_t), the size of the vtable (uint16_t), and the
// size of the referring table (uint16_t).
#define FLATBUFFERS_MIN_BUFFER_SIZE sizeof(uoffset_t) + sizeof(soffset_t) + \
sizeof(uint16_t) + sizeof(uint16_t)
// We support aligning the contents of buffers up to this size.
#ifndef FLATBUFFERS_MAX_ALIGNMENT
#define FLATBUFFERS_MAX_ALIGNMENT 32
#endif
/// @brief The length of a FlatBuffer file header.
static const size_t kFileIdentifierLength = 4;
inline bool VerifyAlignmentRequirements(size_t align, size_t min_align = 1) {
return (min_align <= align) && (align <= (FLATBUFFERS_MAX_ALIGNMENT)) &&
(align & (align - 1)) == 0; // must be power of 2
}
#if defined(_MSC_VER)
#pragma warning(disable: 4351) // C4351: new behavior: elements of array ... will be default initialized
#pragma warning(push)
#pragma warning(disable: 4127) // C4127: conditional expression is constant
#endif
template<typename T> T EndianSwap(T t) {
#if defined(_MSC_VER)
#define FLATBUFFERS_BYTESWAP16 _byteswap_ushort
#define FLATBUFFERS_BYTESWAP32 _byteswap_ulong
#define FLATBUFFERS_BYTESWAP64 _byteswap_uint64
#elif defined(__ICCARM__)
#define FLATBUFFERS_BYTESWAP16 __REV16
#define FLATBUFFERS_BYTESWAP32 __REV
#define FLATBUFFERS_BYTESWAP64(x) \
((__REV(static_cast<uint32_t>(x >> 32U))) | (static_cast<uint64_t>(__REV(static_cast<uint32_t>(x)))) << 32U)
#else
#if defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ < 408 && !defined(__clang__)
// __builtin_bswap16 was missing prior to GCC 4.8.
#define FLATBUFFERS_BYTESWAP16(x) \
static_cast<uint16_t>(__builtin_bswap32(static_cast<uint32_t>(x) << 16))
#else
#define FLATBUFFERS_BYTESWAP16 __builtin_bswap16
#endif
#define FLATBUFFERS_BYTESWAP32 __builtin_bswap32
#define FLATBUFFERS_BYTESWAP64 __builtin_bswap64
#endif
if (sizeof(T) == 1) { // Compile-time if-then's.
return t;
} else if (sizeof(T) == 2) {
union { T t; uint16_t i; } u = { t };
u.i = FLATBUFFERS_BYTESWAP16(u.i);
return u.t;
} else if (sizeof(T) == 4) {
union { T t; uint32_t i; } u = { t };
u.i = FLATBUFFERS_BYTESWAP32(u.i);
return u.t;
} else if (sizeof(T) == 8) {
union { T t; uint64_t i; } u = { t };
u.i = FLATBUFFERS_BYTESWAP64(u.i);
return u.t;
} else {
FLATBUFFERS_ASSERT(0);
return t;
}
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
template<typename T> T EndianScalar(T t) {
#if FLATBUFFERS_LITTLEENDIAN
return t;
#else
return EndianSwap(t);
#endif
}
template<typename T>
// UBSAN: C++ aliasing type rules, see std::bit_cast<> for details.
__suppress_ubsan__("alignment")
T ReadScalar(const void *p) {
return EndianScalar(*reinterpret_cast<const T *>(p));
}
// See https://github.com/google/flatbuffers/issues/5950
#if (FLATBUFFERS_GCC >= 100000) && (FLATBUFFERS_GCC < 110000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
template<typename T>
// UBSAN: C++ aliasing type rules, see std::bit_cast<> for details.
__suppress_ubsan__("alignment")
void WriteScalar(void *p, T t) {
*reinterpret_cast<T *>(p) = EndianScalar(t);
}
template<typename T> struct Offset;
template<typename T> __suppress_ubsan__("alignment") void WriteScalar(void *p, Offset<T> t) {
*reinterpret_cast<uoffset_t *>(p) = EndianScalar(t.o);
}
#if (FLATBUFFERS_GCC >= 100000) && (FLATBUFFERS_GCC < 110000)
#pragma GCC diagnostic pop
#endif
// Computes how many bytes you'd have to pad to be able to write an
// "scalar_size" scalar if the buffer had grown to "buf_size" (downwards in
// memory).
__suppress_ubsan__("unsigned-integer-overflow")
inline size_t PaddingBytes(size_t buf_size, size_t scalar_size) {
return ((~buf_size) + 1) & (scalar_size - 1);
}
// Generic 'operator==' with conditional specialisations.
// T e - new value of a scalar field.
// T def - default of scalar (is known at compile-time).
template<typename T> inline bool IsTheSameAs(T e, T def) { return e == def; }
#if defined(FLATBUFFERS_NAN_DEFAULTS) && \
defined(FLATBUFFERS_HAS_NEW_STRTOD) && (FLATBUFFERS_HAS_NEW_STRTOD > 0)
// Like `operator==(e, def)` with weak NaN if T=(float|double).
template<typename T> inline bool IsFloatTheSameAs(T e, T def) {
return (e == def) || ((def != def) && (e != e));
}
template<> inline bool IsTheSameAs<float>(float e, float def) {
return IsFloatTheSameAs(e, def);
}
template<> inline bool IsTheSameAs<double>(double e, double def) {
return IsFloatTheSameAs(e, def);
}
#endif
// Check 'v' is out of closed range [low; high].
// Workaround for GCC warning [-Werror=type-limits]:
// comparison is always true due to limited range of data type.
template<typename T>
inline bool IsOutRange(const T &v, const T &low, const T &high) {
return (v < low) || (high < v);
}
// Check 'v' is in closed range [low; high].
template<typename T>
inline bool IsInRange(const T &v, const T &low, const T &high) {
return !IsOutRange(v, low, high);
}
} // namespace flatbuffers
#endif // FLATBUFFERS_BASE_H_

View File

@ -0,0 +1,199 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_BUFFER_H_
#define FLATBUFFERS_BUFFER_H_
#include <algorithm>
#include "flatbuffers/base.h"
namespace flatbuffers {
// Wrapper for uoffset_t to allow safe template specialization.
// Value is allowed to be 0 to indicate a null object (see e.g. AddOffset).
template<typename T = void> struct Offset {
// The type of offset to use.
typedef uoffset_t offset_type;
offset_type o;
Offset() : o(0) {}
Offset(const offset_type _o) : o(_o) {}
Offset<> Union() const { return o; }
bool IsNull() const { return !o; }
};
// Wrapper for uoffset64_t Offsets.
template<typename T = void> struct Offset64 {
// The type of offset to use.
typedef uoffset64_t offset_type;
offset_type o;
Offset64() : o(0) {}
Offset64(const offset_type offset) : o(offset) {}
Offset64<> Union() const { return o; }
bool IsNull() const { return !o; }
};
// Litmus check for ensuring the Offsets are the expected size.
static_assert(sizeof(Offset<>) == 4, "Offset has wrong size");
static_assert(sizeof(Offset64<>) == 8, "Offset64 has wrong size");
inline void EndianCheck() {
int endiantest = 1;
// If this fails, see FLATBUFFERS_LITTLEENDIAN above.
FLATBUFFERS_ASSERT(*reinterpret_cast<char *>(&endiantest) ==
FLATBUFFERS_LITTLEENDIAN);
(void)endiantest;
}
template<typename T> FLATBUFFERS_CONSTEXPR size_t AlignOf() {
// clang-format off
#ifdef _MSC_VER
return __alignof(T);
#else
#ifndef alignof
return __alignof__(T);
#else
return alignof(T);
#endif
#endif
// clang-format on
}
// Lexicographically compare two strings (possibly containing nulls), and
// return true if the first is less than the second.
static inline bool StringLessThan(const char *a_data, uoffset_t a_size,
const char *b_data, uoffset_t b_size) {
const auto cmp = memcmp(a_data, b_data, (std::min)(a_size, b_size));
return cmp == 0 ? a_size < b_size : cmp < 0;
}
// When we read serialized data from memory, in the case of most scalars,
// we want to just read T, but in the case of Offset, we want to actually
// perform the indirection and return a pointer.
// The template specialization below does just that.
// It is wrapped in a struct since function templates can't overload on the
// return type like this.
// The typedef is for the convenience of callers of this function
// (avoiding the need for a trailing return decltype)
template<typename T> struct IndirectHelper {
typedef T return_type;
typedef T mutable_return_type;
static const size_t element_stride = sizeof(T);
static return_type Read(const uint8_t *p, const size_t i) {
return EndianScalar((reinterpret_cast<const T *>(p))[i]);
}
static mutable_return_type Read(uint8_t *p, const size_t i) {
return reinterpret_cast<mutable_return_type>(
Read(const_cast<const uint8_t *>(p), i));
}
};
// For vector of Offsets.
template<typename T, template<typename> class OffsetT>
struct IndirectHelper<OffsetT<T>> {
typedef const T *return_type;
typedef T *mutable_return_type;
typedef typename OffsetT<T>::offset_type offset_type;
static const offset_type element_stride = sizeof(offset_type);
static return_type Read(const uint8_t *const p, const offset_type i) {
// Offsets are relative to themselves, so first update the pointer to
// point to the offset location.
const uint8_t *const offset_location = p + i * element_stride;
// Then read the scalar value of the offset (which may be 32 or 64-bits) and
// then determine the relative location from the offset location.
return reinterpret_cast<return_type>(
offset_location + ReadScalar<offset_type>(offset_location));
}
static mutable_return_type Read(uint8_t *const p, const offset_type i) {
// Offsets are relative to themselves, so first update the pointer to
// point to the offset location.
uint8_t *const offset_location = p + i * element_stride;
// Then read the scalar value of the offset (which may be 32 or 64-bits) and
// then determine the relative location from the offset location.
return reinterpret_cast<mutable_return_type>(
offset_location + ReadScalar<offset_type>(offset_location));
}
};
// For vector of structs.
template<typename T> struct IndirectHelper<const T *> {
typedef const T *return_type;
typedef T *mutable_return_type;
static const size_t element_stride = sizeof(T);
static return_type Read(const uint8_t *const p, const size_t i) {
// Structs are stored inline, relative to the first struct pointer.
return reinterpret_cast<return_type>(p + i * element_stride);
}
static mutable_return_type Read(uint8_t *const p, const size_t i) {
// Structs are stored inline, relative to the first struct pointer.
return reinterpret_cast<mutable_return_type>(p + i * element_stride);
}
};
/// @brief Get a pointer to the file_identifier section of the buffer.
/// @return Returns a const char pointer to the start of the file_identifier
/// characters in the buffer. The returned char * has length
/// 'flatbuffers::FlatBufferBuilder::kFileIdentifierLength'.
/// This function is UNDEFINED for FlatBuffers whose schema does not include
/// a file_identifier (likely points at padding or the start of a the root
/// vtable).
inline const char *GetBufferIdentifier(const void *buf,
bool size_prefixed = false) {
return reinterpret_cast<const char *>(buf) +
((size_prefixed) ? 2 * sizeof(uoffset_t) : sizeof(uoffset_t));
}
// Helper to see if the identifier in a buffer has the expected value.
inline bool BufferHasIdentifier(const void *buf, const char *identifier,
bool size_prefixed = false) {
return strncmp(GetBufferIdentifier(buf, size_prefixed), identifier,
flatbuffers::kFileIdentifierLength) == 0;
}
/// @cond FLATBUFFERS_INTERNAL
// Helpers to get a typed pointer to the root object contained in the buffer.
template<typename T> T *GetMutableRoot(void *buf) {
if (!buf) return nullptr;
EndianCheck();
return reinterpret_cast<T *>(
reinterpret_cast<uint8_t *>(buf) +
EndianScalar(*reinterpret_cast<uoffset_t *>(buf)));
}
template<typename T, typename SizeT = uoffset_t>
T *GetMutableSizePrefixedRoot(void *buf) {
return GetMutableRoot<T>(reinterpret_cast<uint8_t *>(buf) + sizeof(SizeT));
}
template<typename T> const T *GetRoot(const void *buf) {
return GetMutableRoot<T>(const_cast<void *>(buf));
}
template<typename T, typename SizeT = uoffset_t>
const T *GetSizePrefixedRoot(const void *buf) {
return GetRoot<T>(reinterpret_cast<const uint8_t *>(buf) + sizeof(SizeT));
}
} // namespace flatbuffers
#endif // FLATBUFFERS_BUFFER_H_

View File

@ -0,0 +1,53 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_BUFFER_REF_H_
#define FLATBUFFERS_BUFFER_REF_H_
#include "flatbuffers/base.h"
#include "flatbuffers/verifier.h"
namespace flatbuffers {
// Convenient way to bundle a buffer and its length, to pass it around
// typed by its root.
// A BufferRef does not own its buffer.
struct BufferRefBase {}; // for std::is_base_of
template<typename T> struct BufferRef : BufferRefBase {
BufferRef() : buf(nullptr), len(0), must_free(false) {}
BufferRef(uint8_t *_buf, uoffset_t _len)
: buf(_buf), len(_len), must_free(false) {}
~BufferRef() {
if (must_free) free(buf);
}
const T *GetRoot() const { return flatbuffers::GetRoot<T>(buf); }
bool Verify() {
Verifier verifier(buf, len);
return verifier.VerifyBuffer<T>(nullptr);
}
uint8_t *buf;
uoffset_t len;
bool must_free;
};
} // namespace flatbuffers
#endif // FLATBUFFERS_BUFFER_REF_H_

View File

@ -0,0 +1,64 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_DEFAULT_ALLOCATOR_H_
#define FLATBUFFERS_DEFAULT_ALLOCATOR_H_
#include "flatbuffers/allocator.h"
#include "flatbuffers/base.h"
namespace flatbuffers {
// DefaultAllocator uses new/delete to allocate memory regions
class DefaultAllocator : public Allocator {
public:
uint8_t *allocate(size_t size) FLATBUFFERS_OVERRIDE {
return new uint8_t[size];
}
void deallocate(uint8_t *p, size_t) FLATBUFFERS_OVERRIDE { delete[] p; }
static void dealloc(void *p, size_t) { delete[] static_cast<uint8_t *>(p); }
};
// These functions allow for a null allocator to mean use the default allocator,
// as used by DetachedBuffer and vector_downward below.
// This is to avoid having a statically or dynamically allocated default
// allocator, or having to move it between the classes that may own it.
inline uint8_t *Allocate(Allocator *allocator, size_t size) {
return allocator ? allocator->allocate(size)
: DefaultAllocator().allocate(size);
}
inline void Deallocate(Allocator *allocator, uint8_t *p, size_t size) {
if (allocator)
allocator->deallocate(p, size);
else
DefaultAllocator().deallocate(p, size);
}
inline uint8_t *ReallocateDownward(Allocator *allocator, uint8_t *old_p,
size_t old_size, size_t new_size,
size_t in_use_back, size_t in_use_front) {
return allocator ? allocator->reallocate_downward(old_p, old_size, new_size,
in_use_back, in_use_front)
: DefaultAllocator().reallocate_downward(
old_p, old_size, new_size, in_use_back, in_use_front);
}
} // namespace flatbuffers
#endif // FLATBUFFERS_DEFAULT_ALLOCATOR_H_

View File

@ -0,0 +1,114 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_DETACHED_BUFFER_H_
#define FLATBUFFERS_DETACHED_BUFFER_H_
#include "flatbuffers/allocator.h"
#include "flatbuffers/base.h"
#include "flatbuffers/default_allocator.h"
namespace flatbuffers {
// DetachedBuffer is a finished flatbuffer memory region, detached from its
// builder. The original memory region and allocator are also stored so that
// the DetachedBuffer can manage the memory lifetime.
class DetachedBuffer {
public:
DetachedBuffer()
: allocator_(nullptr),
own_allocator_(false),
buf_(nullptr),
reserved_(0),
cur_(nullptr),
size_(0) {}
DetachedBuffer(Allocator *allocator, bool own_allocator, uint8_t *buf,
size_t reserved, uint8_t *cur, size_t sz)
: allocator_(allocator),
own_allocator_(own_allocator),
buf_(buf),
reserved_(reserved),
cur_(cur),
size_(sz) {}
DetachedBuffer(DetachedBuffer &&other) noexcept
: allocator_(other.allocator_),
own_allocator_(other.own_allocator_),
buf_(other.buf_),
reserved_(other.reserved_),
cur_(other.cur_),
size_(other.size_) {
other.reset();
}
DetachedBuffer &operator=(DetachedBuffer &&other) noexcept {
if (this == &other) return *this;
destroy();
allocator_ = other.allocator_;
own_allocator_ = other.own_allocator_;
buf_ = other.buf_;
reserved_ = other.reserved_;
cur_ = other.cur_;
size_ = other.size_;
other.reset();
return *this;
}
~DetachedBuffer() { destroy(); }
const uint8_t *data() const { return cur_; }
uint8_t *data() { return cur_; }
size_t size() const { return size_; }
// These may change access mode, leave these at end of public section
FLATBUFFERS_DELETE_FUNC(DetachedBuffer(const DetachedBuffer &other));
FLATBUFFERS_DELETE_FUNC(
DetachedBuffer &operator=(const DetachedBuffer &other));
protected:
Allocator *allocator_;
bool own_allocator_;
uint8_t *buf_;
size_t reserved_;
uint8_t *cur_;
size_t size_;
inline void destroy() {
if (buf_) Deallocate(allocator_, buf_, reserved_);
if (own_allocator_ && allocator_) { delete allocator_; }
reset();
}
inline void reset() {
allocator_ = nullptr;
own_allocator_ = false;
buf_ = nullptr;
reserved_ = 0;
cur_ = nullptr;
size_ = 0;
}
};
} // namespace flatbuffers
#endif // FLATBUFFERS_DETACHED_BUFFER_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,273 @@
/*
* Copyright 2014 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_H_
#define FLATBUFFERS_H_
#include <algorithm>
// TODO: These includes are for mitigating the pains of users editing their
// source because they relied on flatbuffers.h to include everything for them.
#include "flatbuffers/array.h"
#include "flatbuffers/base.h"
#include "flatbuffers/buffer.h"
#include "flatbuffers/buffer_ref.h"
#include "flatbuffers/detached_buffer.h"
#include "flatbuffers/flatbuffer_builder.h"
#include "flatbuffers/stl_emulation.h"
#include "flatbuffers/string.h"
#include "flatbuffers/struct.h"
#include "flatbuffers/table.h"
#include "flatbuffers/vector.h"
#include "flatbuffers/vector_downward.h"
#include "flatbuffers/verifier.h"
namespace flatbuffers {
/// @brief This can compute the start of a FlatBuffer from a root pointer, i.e.
/// it is the opposite transformation of GetRoot().
/// This may be useful if you want to pass on a root and have the recipient
/// delete the buffer afterwards.
inline const uint8_t *GetBufferStartFromRootPointer(const void *root) {
auto table = reinterpret_cast<const Table *>(root);
auto vtable = table->GetVTable();
// Either the vtable is before the root or after the root.
auto start = (std::min)(vtable, reinterpret_cast<const uint8_t *>(root));
// Align to at least sizeof(uoffset_t).
start = reinterpret_cast<const uint8_t *>(reinterpret_cast<uintptr_t>(start) &
~(sizeof(uoffset_t) - 1));
// Additionally, there may be a file_identifier in the buffer, and the root
// offset. The buffer may have been aligned to any size between
// sizeof(uoffset_t) and FLATBUFFERS_MAX_ALIGNMENT (see "force_align").
// Sadly, the exact alignment is only known when constructing the buffer,
// since it depends on the presence of values with said alignment properties.
// So instead, we simply look at the next uoffset_t values (root,
// file_identifier, and alignment padding) to see which points to the root.
// None of the other values can "impersonate" the root since they will either
// be 0 or four ASCII characters.
static_assert(flatbuffers::kFileIdentifierLength == sizeof(uoffset_t),
"file_identifier is assumed to be the same size as uoffset_t");
for (auto possible_roots = FLATBUFFERS_MAX_ALIGNMENT / sizeof(uoffset_t) + 1;
possible_roots; possible_roots--) {
start -= sizeof(uoffset_t);
if (ReadScalar<uoffset_t>(start) + start ==
reinterpret_cast<const uint8_t *>(root))
return start;
}
// We didn't find the root, either the "root" passed isn't really a root,
// or the buffer is corrupt.
// Assert, because calling this function with bad data may cause reads
// outside of buffer boundaries.
FLATBUFFERS_ASSERT(false);
return nullptr;
}
/// @brief This return the prefixed size of a FlatBuffer.
template<typename SizeT = uoffset_t>
inline SizeT GetPrefixedSize(const uint8_t *buf) {
return ReadScalar<SizeT>(buf);
}
// Base class for native objects (FlatBuffer data de-serialized into native
// C++ data structures).
// Contains no functionality, purely documentative.
struct NativeTable {};
/// @brief Function types to be used with resolving hashes into objects and
/// back again. The resolver gets a pointer to a field inside an object API
/// object that is of the type specified in the schema using the attribute
/// `cpp_type` (it is thus important whatever you write to this address
/// matches that type). The value of this field is initially null, so you
/// may choose to implement a delayed binding lookup using this function
/// if you wish. The resolver does the opposite lookup, for when the object
/// is being serialized again.
typedef uint64_t hash_value_t;
typedef std::function<void(void **pointer_adr, hash_value_t hash)>
resolver_function_t;
typedef std::function<hash_value_t(void *pointer)> rehasher_function_t;
// Helper function to test if a field is present, using any of the field
// enums in the generated code.
// `table` must be a generated table type. Since this is a template parameter,
// this is not typechecked to be a subclass of Table, so beware!
// Note: this function will return false for fields equal to the default
// value, since they're not stored in the buffer (unless force_defaults was
// used).
template<typename T>
bool IsFieldPresent(const T *table, typename T::FlatBuffersVTableOffset field) {
// Cast, since Table is a private baseclass of any table types.
return reinterpret_cast<const Table *>(table)->CheckField(
static_cast<voffset_t>(field));
}
// Utility function for reverse lookups on the EnumNames*() functions
// (in the generated C++ code)
// names must be NULL terminated.
inline int LookupEnum(const char **names, const char *name) {
for (const char **p = names; *p; p++)
if (!strcmp(*p, name)) return static_cast<int>(p - names);
return -1;
}
// These macros allow us to layout a struct with a guarantee that they'll end
// up looking the same on different compilers and platforms.
// It does this by disallowing the compiler to do any padding, and then
// does padding itself by inserting extra padding fields that make every
// element aligned to its own size.
// Additionally, it manually sets the alignment of the struct as a whole,
// which is typically its largest element, or a custom size set in the schema
// by the force_align attribute.
// These are used in the generated code only.
// clang-format off
#if defined(_MSC_VER)
#define FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(alignment) \
__pragma(pack(1)) \
struct __declspec(align(alignment))
#define FLATBUFFERS_STRUCT_END(name, size) \
__pragma(pack()) \
static_assert(sizeof(name) == size, "compiler breaks packing rules")
#elif defined(__GNUC__) || defined(__clang__) || defined(__ICCARM__)
#define FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(alignment) \
_Pragma("pack(1)") \
struct __attribute__((aligned(alignment)))
#define FLATBUFFERS_STRUCT_END(name, size) \
_Pragma("pack()") \
static_assert(sizeof(name) == size, "compiler breaks packing rules")
#else
#error Unknown compiler, please define structure alignment macros
#endif
// clang-format on
// Minimal reflection via code generation.
// Besides full-fat reflection (see reflection.h) and parsing/printing by
// loading schemas (see idl.h), we can also have code generation for minimal
// reflection data which allows pretty-printing and other uses without needing
// a schema or a parser.
// Generate code with --reflect-types (types only) or --reflect-names (names
// also) to enable.
// See minireflect.h for utilities using this functionality.
// These types are organized slightly differently as the ones in idl.h.
enum SequenceType { ST_TABLE, ST_STRUCT, ST_UNION, ST_ENUM };
// Scalars have the same order as in idl.h
// clang-format off
#define FLATBUFFERS_GEN_ELEMENTARY_TYPES(ET) \
ET(ET_UTYPE) \
ET(ET_BOOL) \
ET(ET_CHAR) \
ET(ET_UCHAR) \
ET(ET_SHORT) \
ET(ET_USHORT) \
ET(ET_INT) \
ET(ET_UINT) \
ET(ET_LONG) \
ET(ET_ULONG) \
ET(ET_FLOAT) \
ET(ET_DOUBLE) \
ET(ET_STRING) \
ET(ET_SEQUENCE) // See SequenceType.
enum ElementaryType {
#define FLATBUFFERS_ET(E) E,
FLATBUFFERS_GEN_ELEMENTARY_TYPES(FLATBUFFERS_ET)
#undef FLATBUFFERS_ET
};
inline const char * const *ElementaryTypeNames() {
static const char * const names[] = {
#define FLATBUFFERS_ET(E) #E,
FLATBUFFERS_GEN_ELEMENTARY_TYPES(FLATBUFFERS_ET)
#undef FLATBUFFERS_ET
};
return names;
}
// clang-format on
// Basic type info cost just 16bits per field!
// We're explicitly defining the signedness since the signedness of integer
// bitfields is otherwise implementation-defined and causes warnings on older
// GCC compilers.
struct TypeCode {
// ElementaryType
unsigned short base_type : 4;
// Either vector (in table) or array (in struct)
unsigned short is_repeating : 1;
// Index into type_refs below, or -1 for none.
signed short sequence_ref : 11;
};
static_assert(sizeof(TypeCode) == 2, "TypeCode");
struct TypeTable;
// Signature of the static method present in each type.
typedef const TypeTable *(*TypeFunction)();
struct TypeTable {
SequenceType st;
size_t num_elems; // of type_codes, values, names (but not type_refs).
const TypeCode *type_codes; // num_elems count
const TypeFunction *type_refs; // less than num_elems entries (see TypeCode).
const int16_t *array_sizes; // less than num_elems entries (see TypeCode).
const int64_t *values; // Only set for non-consecutive enum/union or structs.
const char *const *names; // Only set if compiled with --reflect-names.
};
// String which identifies the current version of FlatBuffers.
inline const char *flatbuffers_version_string() {
return "FlatBuffers " FLATBUFFERS_STRING(FLATBUFFERS_VERSION_MAJOR) "."
FLATBUFFERS_STRING(FLATBUFFERS_VERSION_MINOR) "."
FLATBUFFERS_STRING(FLATBUFFERS_VERSION_REVISION);
}
// clang-format off
#define FLATBUFFERS_DEFINE_BITMASK_OPERATORS(E, T)\
inline E operator | (E lhs, E rhs){\
return E(T(lhs) | T(rhs));\
}\
inline E operator & (E lhs, E rhs){\
return E(T(lhs) & T(rhs));\
}\
inline E operator ^ (E lhs, E rhs){\
return E(T(lhs) ^ T(rhs));\
}\
inline E operator ~ (E lhs){\
return E(~T(lhs));\
}\
inline E operator |= (E &lhs, E rhs){\
lhs = lhs | rhs;\
return lhs;\
}\
inline E operator &= (E &lhs, E rhs){\
lhs = lhs & rhs;\
return lhs;\
}\
inline E operator ^= (E &lhs, E rhs){\
lhs = lhs ^ rhs;\
return lhs;\
}\
inline bool operator !(E rhs) \
{\
return !bool(T(rhs)); \
}
/// @endcond
} // namespace flatbuffers
// clang-format on
#endif // FLATBUFFERS_H_

View File

@ -0,0 +1,513 @@
/*
* Copyright 2017 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_STL_EMULATION_H_
#define FLATBUFFERS_STL_EMULATION_H_
// clang-format off
#include "flatbuffers/base.h"
#include <string>
#include <type_traits>
#include <vector>
#include <memory>
#include <limits>
#ifndef FLATBUFFERS_USE_STD_OPTIONAL
// Detect C++17 compatible compiler.
// __cplusplus >= 201703L - a compiler has support of 'static inline' variables.
#if (defined(__cplusplus) && __cplusplus >= 201703L) \
|| (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#define FLATBUFFERS_USE_STD_OPTIONAL 1
#else
#define FLATBUFFERS_USE_STD_OPTIONAL 0
#endif // (defined(__cplusplus) && __cplusplus >= 201703L) ...
#endif // FLATBUFFERS_USE_STD_OPTIONAL
#if FLATBUFFERS_USE_STD_OPTIONAL
#include <optional>
#endif
#ifndef FLATBUFFERS_USE_STD_SPAN
// Testing __cpp_lib_span requires including either <version> or <span>,
// both of which were added in C++20.
// See: https://en.cppreference.com/w/cpp/utility/feature_test
#if defined(__cplusplus) && __cplusplus >= 202002L
#define FLATBUFFERS_USE_STD_SPAN 1
#endif
#endif // FLATBUFFERS_USE_STD_SPAN
#if defined(FLATBUFFERS_USE_STD_SPAN)
#include <array>
#include <span>
#else
// Disable non-trivial ctors if FLATBUFFERS_SPAN_MINIMAL defined.
#if !defined(FLATBUFFERS_TEMPLATES_ALIASES)
#define FLATBUFFERS_SPAN_MINIMAL
#else
// Enable implicit construction of a span<T,N> from a std::array<T,N>.
#include <array>
#endif
#endif // defined(FLATBUFFERS_USE_STD_SPAN)
// This header provides backwards compatibility for older versions of the STL.
namespace flatbuffers {
#if defined(FLATBUFFERS_TEMPLATES_ALIASES)
template <typename T>
using numeric_limits = std::numeric_limits<T>;
#else
template <typename T> class numeric_limits :
public std::numeric_limits<T> {};
#endif // defined(FLATBUFFERS_TEMPLATES_ALIASES)
#if defined(FLATBUFFERS_TEMPLATES_ALIASES)
template <typename T> using is_scalar = std::is_scalar<T>;
template <typename T, typename U> using is_same = std::is_same<T,U>;
template <typename T> using is_floating_point = std::is_floating_point<T>;
template <typename T> using is_unsigned = std::is_unsigned<T>;
template <typename T> using is_enum = std::is_enum<T>;
template <typename T> using make_unsigned = std::make_unsigned<T>;
template<bool B, class T, class F>
using conditional = std::conditional<B, T, F>;
template<class T, T v>
using integral_constant = std::integral_constant<T, v>;
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type = std::true_type;
using false_type = std::false_type;
#else
// MSVC 2010 doesn't support C++11 aliases.
template <typename T> struct is_scalar : public std::is_scalar<T> {};
template <typename T, typename U> struct is_same : public std::is_same<T,U> {};
template <typename T> struct is_floating_point :
public std::is_floating_point<T> {};
template <typename T> struct is_unsigned : public std::is_unsigned<T> {};
template <typename T> struct is_enum : public std::is_enum<T> {};
template <typename T> struct make_unsigned : public std::make_unsigned<T> {};
template<bool B, class T, class F>
struct conditional : public std::conditional<B, T, F> {};
template<class T, T v>
struct integral_constant : public std::integral_constant<T, v> {};
template <bool B>
struct bool_constant : public integral_constant<bool, B> {};
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;
#endif // defined(FLATBUFFERS_TEMPLATES_ALIASES)
#if defined(FLATBUFFERS_TEMPLATES_ALIASES)
template <class T> using unique_ptr = std::unique_ptr<T>;
#else
// MSVC 2010 doesn't support C++11 aliases.
// We're manually "aliasing" the class here as we want to bring unique_ptr
// into the flatbuffers namespace. We have unique_ptr in the flatbuffers
// namespace we have a completely independent implementation (see below)
// for C++98 STL implementations.
template <class T> class unique_ptr : public std::unique_ptr<T> {
public:
unique_ptr() {}
explicit unique_ptr(T* p) : std::unique_ptr<T>(p) {}
unique_ptr(std::unique_ptr<T>&& u) { *this = std::move(u); }
unique_ptr(unique_ptr&& u) { *this = std::move(u); }
unique_ptr& operator=(std::unique_ptr<T>&& u) {
std::unique_ptr<T>::reset(u.release());
return *this;
}
unique_ptr& operator=(unique_ptr&& u) {
std::unique_ptr<T>::reset(u.release());
return *this;
}
unique_ptr& operator=(T* p) {
return std::unique_ptr<T>::operator=(p);
}
};
#endif // defined(FLATBUFFERS_TEMPLATES_ALIASES)
#if FLATBUFFERS_USE_STD_OPTIONAL
template<class T>
using Optional = std::optional<T>;
using nullopt_t = std::nullopt_t;
inline constexpr nullopt_t nullopt = std::nullopt;
#else
// Limited implementation of Optional<T> type for a scalar T.
// This implementation limited by trivial types compatible with
// std::is_arithmetic<T> or std::is_enum<T> type traits.
// A tag to indicate an empty flatbuffers::optional<T>.
struct nullopt_t {
explicit FLATBUFFERS_CONSTEXPR_CPP11 nullopt_t(int) {}
};
#if defined(FLATBUFFERS_CONSTEXPR_DEFINED)
namespace internal {
template <class> struct nullopt_holder {
static constexpr nullopt_t instance_ = nullopt_t(0);
};
template<class Dummy>
constexpr nullopt_t nullopt_holder<Dummy>::instance_;
}
static constexpr const nullopt_t &nullopt = internal::nullopt_holder<void>::instance_;
#else
namespace internal {
template <class> struct nullopt_holder {
static const nullopt_t instance_;
};
template<class Dummy>
const nullopt_t nullopt_holder<Dummy>::instance_ = nullopt_t(0);
}
static const nullopt_t &nullopt = internal::nullopt_holder<void>::instance_;
#endif
template<class T>
class Optional FLATBUFFERS_FINAL_CLASS {
// Non-scalar 'T' would extremely complicated Optional<T>.
// Use is_scalar<T> checking because flatbuffers flatbuffers::is_arithmetic<T>
// isn't implemented.
static_assert(flatbuffers::is_scalar<T>::value, "unexpected type T");
public:
~Optional() {}
FLATBUFFERS_CONSTEXPR_CPP11 Optional() FLATBUFFERS_NOEXCEPT
: value_(), has_value_(false) {}
FLATBUFFERS_CONSTEXPR_CPP11 Optional(nullopt_t) FLATBUFFERS_NOEXCEPT
: value_(), has_value_(false) {}
FLATBUFFERS_CONSTEXPR_CPP11 Optional(T val) FLATBUFFERS_NOEXCEPT
: value_(val), has_value_(true) {}
FLATBUFFERS_CONSTEXPR_CPP11 Optional(const Optional &other) FLATBUFFERS_NOEXCEPT
: value_(other.value_), has_value_(other.has_value_) {}
FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(const Optional &other) FLATBUFFERS_NOEXCEPT {
value_ = other.value_;
has_value_ = other.has_value_;
return *this;
}
FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(nullopt_t) FLATBUFFERS_NOEXCEPT {
value_ = T();
has_value_ = false;
return *this;
}
FLATBUFFERS_CONSTEXPR_CPP14 Optional &operator=(T val) FLATBUFFERS_NOEXCEPT {
value_ = val;
has_value_ = true;
return *this;
}
void reset() FLATBUFFERS_NOEXCEPT {
*this = nullopt;
}
void swap(Optional &other) FLATBUFFERS_NOEXCEPT {
std::swap(value_, other.value_);
std::swap(has_value_, other.has_value_);
}
FLATBUFFERS_CONSTEXPR_CPP11 FLATBUFFERS_EXPLICIT_CPP11 operator bool() const FLATBUFFERS_NOEXCEPT {
return has_value_;
}
FLATBUFFERS_CONSTEXPR_CPP11 bool has_value() const FLATBUFFERS_NOEXCEPT {
return has_value_;
}
FLATBUFFERS_CONSTEXPR_CPP11 const T& operator*() const FLATBUFFERS_NOEXCEPT {
return value_;
}
const T& value() const {
FLATBUFFERS_ASSERT(has_value());
return value_;
}
T value_or(T default_value) const FLATBUFFERS_NOEXCEPT {
return has_value() ? value_ : default_value;
}
private:
T value_;
bool has_value_;
};
template<class T>
FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& opt, nullopt_t) FLATBUFFERS_NOEXCEPT {
return !opt;
}
template<class T>
FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(nullopt_t, const Optional<T>& opt) FLATBUFFERS_NOEXCEPT {
return !opt;
}
template<class T, class U>
FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& lhs, const U& rhs) FLATBUFFERS_NOEXCEPT {
return static_cast<bool>(lhs) && (*lhs == rhs);
}
template<class T, class U>
FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const T& lhs, const Optional<U>& rhs) FLATBUFFERS_NOEXCEPT {
return static_cast<bool>(rhs) && (lhs == *rhs);
}
template<class T, class U>
FLATBUFFERS_CONSTEXPR_CPP11 bool operator==(const Optional<T>& lhs, const Optional<U>& rhs) FLATBUFFERS_NOEXCEPT {
return static_cast<bool>(lhs) != static_cast<bool>(rhs)
? false
: !static_cast<bool>(lhs) ? false : (*lhs == *rhs);
}
#endif // FLATBUFFERS_USE_STD_OPTIONAL
// Very limited and naive partial implementation of C++20 std::span<T,Extent>.
#if defined(FLATBUFFERS_USE_STD_SPAN)
inline constexpr std::size_t dynamic_extent = std::dynamic_extent;
template<class T, std::size_t Extent = std::dynamic_extent>
using span = std::span<T, Extent>;
#else // !defined(FLATBUFFERS_USE_STD_SPAN)
FLATBUFFERS_CONSTEXPR std::size_t dynamic_extent = static_cast<std::size_t>(-1);
// Exclude this code if MSVC2010 or non-STL Android is active.
// The non-STL Android doesn't have `std::is_convertible` required for SFINAE.
#if !defined(FLATBUFFERS_SPAN_MINIMAL)
namespace internal {
// This is SFINAE helper class for checking of a common condition:
// > This overload only participates in overload resolution
// > Check whether a pointer to an array of From can be converted
// > to a pointer to an array of To.
// This helper is used for checking of 'From -> const From'.
template<class To, std::size_t Extent, class From, std::size_t N>
struct is_span_convertible {
using type =
typename std::conditional<std::is_convertible<From (*)[], To (*)[]>::value
&& (Extent == dynamic_extent || N == Extent),
int, void>::type;
};
template<typename T>
struct SpanIterator {
// TODO: upgrade to std::random_access_iterator_tag.
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = typename std::remove_cv<T>::type;
using reference = T&;
using pointer = T*;
// Convince MSVC compiler that this iterator is trusted (it is verified).
#ifdef _MSC_VER
using _Unchecked_type = pointer;
#endif // _MSC_VER
SpanIterator(pointer ptr) : ptr_(ptr) {}
reference operator*() const { return *ptr_; }
pointer operator->() { return ptr_; }
SpanIterator& operator++() { ptr_++; return *this; }
SpanIterator operator++(int) { auto tmp = *this; ++(*this); return tmp; }
friend bool operator== (const SpanIterator& lhs, const SpanIterator& rhs) { return lhs.ptr_ == rhs.ptr_; }
friend bool operator!= (const SpanIterator& lhs, const SpanIterator& rhs) { return lhs.ptr_ != rhs.ptr_; }
private:
pointer ptr_;
};
} // namespace internal
#endif // !defined(FLATBUFFERS_SPAN_MINIMAL)
// T - element type; must be a complete type that is not an abstract
// class type.
// Extent - the number of elements in the sequence, or dynamic.
template<class T, std::size_t Extent = dynamic_extent>
class span FLATBUFFERS_FINAL_CLASS {
public:
typedef T element_type;
typedef T& reference;
typedef const T& const_reference;
typedef T* pointer;
typedef const T* const_pointer;
typedef std::size_t size_type;
static FLATBUFFERS_CONSTEXPR size_type extent = Extent;
// Returns the number of elements in the span.
FLATBUFFERS_CONSTEXPR_CPP11 size_type size() const FLATBUFFERS_NOEXCEPT {
return count_;
}
// Returns the size of the sequence in bytes.
FLATBUFFERS_CONSTEXPR_CPP11
size_type size_bytes() const FLATBUFFERS_NOEXCEPT {
return size() * sizeof(element_type);
}
// Checks if the span is empty.
FLATBUFFERS_CONSTEXPR_CPP11 bool empty() const FLATBUFFERS_NOEXCEPT {
return size() == 0;
}
// Returns a pointer to the beginning of the sequence.
FLATBUFFERS_CONSTEXPR_CPP11 pointer data() const FLATBUFFERS_NOEXCEPT {
return data_;
}
#if !defined(FLATBUFFERS_SPAN_MINIMAL)
using Iterator = internal::SpanIterator<T>;
Iterator begin() const { return Iterator(data()); }
Iterator end() const { return Iterator(data() + size()); }
#endif
// Returns a reference to the idx-th element of the sequence.
// The behavior is undefined if the idx is greater than or equal to size().
FLATBUFFERS_CONSTEXPR_CPP11 reference operator[](size_type idx) const {
return data()[idx];
}
FLATBUFFERS_CONSTEXPR_CPP11 span(const span &other) FLATBUFFERS_NOEXCEPT
: data_(other.data_), count_(other.count_) {}
FLATBUFFERS_CONSTEXPR_CPP14 span &operator=(const span &other)
FLATBUFFERS_NOEXCEPT {
data_ = other.data_;
count_ = other.count_;
}
// Limited implementation of
// `template <class It> constexpr std::span(It first, size_type count);`.
//
// Constructs a span that is a view over the range [first, first + count);
// the resulting span has: data() == first and size() == count.
// The behavior is undefined if [first, first + count) is not a valid range,
// or if (extent != flatbuffers::dynamic_extent && count != extent).
FLATBUFFERS_CONSTEXPR_CPP11
explicit span(pointer first, size_type count) FLATBUFFERS_NOEXCEPT
: data_ (Extent == dynamic_extent ? first : (Extent == count ? first : nullptr)),
count_(Extent == dynamic_extent ? count : (Extent == count ? Extent : 0)) {
// Make span empty if the count argument is incompatible with span<T,N>.
}
// Exclude this code if MSVC2010 is active. The MSVC2010 isn't C++11
// compliant, it doesn't support default template arguments for functions.
#if defined(FLATBUFFERS_SPAN_MINIMAL)
FLATBUFFERS_CONSTEXPR_CPP11 span() FLATBUFFERS_NOEXCEPT : data_(nullptr),
count_(0) {
static_assert(extent == 0 || extent == dynamic_extent, "invalid span");
}
#else
// Constructs an empty span whose data() == nullptr and size() == 0.
// This overload only participates in overload resolution if
// extent == 0 || extent == flatbuffers::dynamic_extent.
// A dummy template argument N is need dependency for SFINAE.
template<std::size_t N = 0,
typename internal::is_span_convertible<element_type, Extent, element_type, (N - N)>::type = 0>
FLATBUFFERS_CONSTEXPR_CPP11 span() FLATBUFFERS_NOEXCEPT : data_(nullptr),
count_(0) {
static_assert(extent == 0 || extent == dynamic_extent, "invalid span");
}
// Constructs a span that is a view over the array arr; the resulting span
// has size() == N and data() == std::data(arr). These overloads only
// participate in overload resolution if
// extent == std::dynamic_extent || N == extent is true and
// std::remove_pointer_t<decltype(std::data(arr))>(*)[]
// is convertible to element_type (*)[].
template<std::size_t N,
typename internal::is_span_convertible<element_type, Extent, element_type, N>::type = 0>
FLATBUFFERS_CONSTEXPR_CPP11 span(element_type (&arr)[N]) FLATBUFFERS_NOEXCEPT
: data_(arr), count_(N) {}
template<class U, std::size_t N,
typename internal::is_span_convertible<element_type, Extent, U, N>::type = 0>
FLATBUFFERS_CONSTEXPR_CPP11 span(std::array<U, N> &arr) FLATBUFFERS_NOEXCEPT
: data_(arr.data()), count_(N) {}
//template<class U, std::size_t N,
// int = 0>
//FLATBUFFERS_CONSTEXPR_CPP11 span(std::array<U, N> &arr) FLATBUFFERS_NOEXCEPT
// : data_(arr.data()), count_(N) {}
template<class U, std::size_t N,
typename internal::is_span_convertible<element_type, Extent, U, N>::type = 0>
FLATBUFFERS_CONSTEXPR_CPP11 span(const std::array<U, N> &arr) FLATBUFFERS_NOEXCEPT
: data_(arr.data()), count_(N) {}
// Converting constructor from another span s;
// the resulting span has size() == s.size() and data() == s.data().
// This overload only participates in overload resolution
// if extent == std::dynamic_extent || N == extent is true and U (*)[]
// is convertible to element_type (*)[].
template<class U, std::size_t N,
typename internal::is_span_convertible<element_type, Extent, U, N>::type = 0>
FLATBUFFERS_CONSTEXPR_CPP11 span(const flatbuffers::span<U, N> &s) FLATBUFFERS_NOEXCEPT
: span(s.data(), s.size()) {
}
#endif // !defined(FLATBUFFERS_SPAN_MINIMAL)
private:
// This is a naive implementation with 'count_' member even if (Extent != dynamic_extent).
pointer const data_;
size_type count_;
};
#endif // defined(FLATBUFFERS_USE_STD_SPAN)
#if !defined(FLATBUFFERS_SPAN_MINIMAL)
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<ElementType, Extent> make_span(ElementType(&arr)[Extent]) FLATBUFFERS_NOEXCEPT {
return span<ElementType, Extent>(arr);
}
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<const ElementType, Extent> make_span(const ElementType(&arr)[Extent]) FLATBUFFERS_NOEXCEPT {
return span<const ElementType, Extent>(arr);
}
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<ElementType, Extent> make_span(std::array<ElementType, Extent> &arr) FLATBUFFERS_NOEXCEPT {
return span<ElementType, Extent>(arr);
}
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<const ElementType, Extent> make_span(const std::array<ElementType, Extent> &arr) FLATBUFFERS_NOEXCEPT {
return span<const ElementType, Extent>(arr);
}
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<ElementType, dynamic_extent> make_span(ElementType *first, std::size_t count) FLATBUFFERS_NOEXCEPT {
return span<ElementType, dynamic_extent>(first, count);
}
template<class ElementType, std::size_t Extent>
FLATBUFFERS_CONSTEXPR_CPP11
flatbuffers::span<const ElementType, dynamic_extent> make_span(const ElementType *first, std::size_t count) FLATBUFFERS_NOEXCEPT {
return span<const ElementType, dynamic_extent>(first, count);
}
#endif // !defined(FLATBUFFERS_SPAN_MINIMAL)
} // namespace flatbuffers
#endif // FLATBUFFERS_STL_EMULATION_H_

View File

@ -0,0 +1,64 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_STRING_H_
#define FLATBUFFERS_STRING_H_
#include "flatbuffers/base.h"
#include "flatbuffers/vector.h"
namespace flatbuffers {
struct String : public Vector<char> {
const char *c_str() const { return reinterpret_cast<const char *>(Data()); }
std::string str() const { return std::string(c_str(), size()); }
// clang-format off
#ifdef FLATBUFFERS_HAS_STRING_VIEW
flatbuffers::string_view string_view() const {
return flatbuffers::string_view(c_str(), size());
}
#endif // FLATBUFFERS_HAS_STRING_VIEW
// clang-format on
bool operator<(const String &o) const {
return StringLessThan(this->data(), this->size(), o.data(), o.size());
}
};
// Convenience function to get std::string from a String returning an empty
// string on null pointer.
static inline std::string GetString(const String *str) {
return str ? str->str() : "";
}
// Convenience function to get char* from a String returning an empty string on
// null pointer.
static inline const char *GetCstring(const String *str) {
return str ? str->c_str() : "";
}
#ifdef FLATBUFFERS_HAS_STRING_VIEW
// Convenience function to get string_view from a String returning an empty
// string_view on null pointer.
static inline flatbuffers::string_view GetStringView(const String *str) {
return str ? str->string_view() : flatbuffers::string_view();
}
#endif // FLATBUFFERS_HAS_STRING_VIEW
} // namespace flatbuffers
#endif // FLATBUFFERS_STRING_H_

View File

@ -0,0 +1,53 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_STRUCT_H_
#define FLATBUFFERS_STRUCT_H_
#include "flatbuffers/base.h"
namespace flatbuffers {
// "structs" are flat structures that do not have an offset table, thus
// always have all members present and do not support forwards/backwards
// compatible extensions.
class Struct FLATBUFFERS_FINAL_CLASS {
public:
template<typename T> T GetField(uoffset_t o) const {
return ReadScalar<T>(&data_[o]);
}
template<typename T> T GetStruct(uoffset_t o) const {
return reinterpret_cast<T>(&data_[o]);
}
const uint8_t *GetAddressOf(uoffset_t o) const { return &data_[o]; }
uint8_t *GetAddressOf(uoffset_t o) { return &data_[o]; }
private:
// private constructor & copy constructor: you obtain instances of this
// class by pointing to existing data only
Struct();
Struct(const Struct &);
Struct &operator=(const Struct &);
uint8_t data_[1];
};
} // namespace flatbuffers
#endif // FLATBUFFERS_STRUCT_H_

View File

@ -0,0 +1,188 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_TABLE_H_
#define FLATBUFFERS_TABLE_H_
#include "flatbuffers/base.h"
#include "flatbuffers/verifier.h"
namespace flatbuffers {
// "tables" use an offset table (possibly shared) that allows fields to be
// omitted and added at will, but uses an extra indirection to read.
class Table {
public:
const uint8_t *GetVTable() const {
return data_ - ReadScalar<soffset_t>(data_);
}
// This gets the field offset for any of the functions below it, or 0
// if the field was not present.
voffset_t GetOptionalFieldOffset(voffset_t field) const {
// The vtable offset is always at the start.
auto vtable = GetVTable();
// The first element is the size of the vtable (fields + type id + itself).
auto vtsize = ReadScalar<voffset_t>(vtable);
// If the field we're accessing is outside the vtable, we're reading older
// data, so it's the same as if the offset was 0 (not present).
return field < vtsize ? ReadScalar<voffset_t>(vtable + field) : 0;
}
template<typename T> T GetField(voffset_t field, T defaultval) const {
auto field_offset = GetOptionalFieldOffset(field);
return field_offset ? ReadScalar<T>(data_ + field_offset) : defaultval;
}
template<typename P, typename OffsetSize = uoffset_t>
P GetPointer(voffset_t field) {
auto field_offset = GetOptionalFieldOffset(field);
auto p = data_ + field_offset;
return field_offset ? reinterpret_cast<P>(p + ReadScalar<OffsetSize>(p))
: nullptr;
}
template<typename P, typename OffsetSize = uoffset_t>
P GetPointer(voffset_t field) const {
return const_cast<Table *>(this)->GetPointer<P, OffsetSize>(field);
}
template<typename P> P GetPointer64(voffset_t field) {
return GetPointer<P, uoffset64_t>(field);
}
template<typename P> P GetPointer64(voffset_t field) const {
return GetPointer<P, uoffset64_t>(field);
}
template<typename P> P GetStruct(voffset_t field) const {
auto field_offset = GetOptionalFieldOffset(field);
auto p = const_cast<uint8_t *>(data_ + field_offset);
return field_offset ? reinterpret_cast<P>(p) : nullptr;
}
template<typename Raw, typename Face>
flatbuffers::Optional<Face> GetOptional(voffset_t field) const {
auto field_offset = GetOptionalFieldOffset(field);
auto p = data_ + field_offset;
return field_offset ? Optional<Face>(static_cast<Face>(ReadScalar<Raw>(p)))
: Optional<Face>();
}
template<typename T> bool SetField(voffset_t field, T val, T def) {
auto field_offset = GetOptionalFieldOffset(field);
if (!field_offset) return IsTheSameAs(val, def);
WriteScalar(data_ + field_offset, val);
return true;
}
template<typename T> bool SetField(voffset_t field, T val) {
auto field_offset = GetOptionalFieldOffset(field);
if (!field_offset) return false;
WriteScalar(data_ + field_offset, val);
return true;
}
bool SetPointer(voffset_t field, const uint8_t *val) {
auto field_offset = GetOptionalFieldOffset(field);
if (!field_offset) return false;
WriteScalar(data_ + field_offset,
static_cast<uoffset_t>(val - (data_ + field_offset)));
return true;
}
uint8_t *GetAddressOf(voffset_t field) {
auto field_offset = GetOptionalFieldOffset(field);
return field_offset ? data_ + field_offset : nullptr;
}
const uint8_t *GetAddressOf(voffset_t field) const {
return const_cast<Table *>(this)->GetAddressOf(field);
}
bool CheckField(voffset_t field) const {
return GetOptionalFieldOffset(field) != 0;
}
// Verify the vtable of this table.
// Call this once per table, followed by VerifyField once per field.
bool VerifyTableStart(Verifier &verifier) const {
return verifier.VerifyTableStart(data_);
}
// Verify a particular field.
template<typename T>
bool VerifyField(const Verifier &verifier, voffset_t field,
size_t align) const {
// Calling GetOptionalFieldOffset should be safe now thanks to
// VerifyTable().
auto field_offset = GetOptionalFieldOffset(field);
// Check the actual field.
return !field_offset || verifier.VerifyField<T>(data_, field_offset, align);
}
// VerifyField for required fields.
template<typename T>
bool VerifyFieldRequired(const Verifier &verifier, voffset_t field,
size_t align) const {
auto field_offset = GetOptionalFieldOffset(field);
return verifier.Check(field_offset != 0) &&
verifier.VerifyField<T>(data_, field_offset, align);
}
// Versions for offsets.
template<typename OffsetT = uoffset_t>
bool VerifyOffset(const Verifier &verifier, voffset_t field) const {
auto field_offset = GetOptionalFieldOffset(field);
return !field_offset || verifier.VerifyOffset<OffsetT>(data_, field_offset);
}
template<typename OffsetT = uoffset_t>
bool VerifyOffsetRequired(const Verifier &verifier, voffset_t field) const {
auto field_offset = GetOptionalFieldOffset(field);
return verifier.Check(field_offset != 0) &&
verifier.VerifyOffset<OffsetT>(data_, field_offset);
}
bool VerifyOffset64(const Verifier &verifier, voffset_t field) const {
return VerifyOffset<uoffset64_t>(verifier, field);
}
bool VerifyOffset64Required(const Verifier &verifier, voffset_t field) const {
return VerifyOffsetRequired<uoffset64_t>(verifier, field);
}
private:
// private constructor & copy constructor: you obtain instances of this
// class by pointing to existing data only
Table();
Table(const Table &other);
Table &operator=(const Table &);
uint8_t data_[1];
};
// This specialization allows avoiding warnings like:
// MSVC C4800: type: forcing value to bool 'true' or 'false'.
template<>
inline flatbuffers::Optional<bool> Table::GetOptional<uint8_t, bool>(
voffset_t field) const {
auto field_offset = GetOptionalFieldOffset(field);
auto p = data_ + field_offset;
return field_offset ? Optional<bool>(ReadScalar<uint8_t>(p) != 0)
: Optional<bool>();
}
} // namespace flatbuffers
#endif // FLATBUFFERS_TABLE_H_

View File

@ -0,0 +1,400 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_VECTOR_H_
#define FLATBUFFERS_VECTOR_H_
#include "flatbuffers/base.h"
#include "flatbuffers/buffer.h"
#include "flatbuffers/stl_emulation.h"
namespace flatbuffers {
struct String;
// An STL compatible iterator implementation for Vector below, effectively
// calling Get() for every element.
template<typename T, typename IT, typename Data = uint8_t *,
typename SizeT = uoffset_t>
struct VectorIterator {
typedef std::random_access_iterator_tag iterator_category;
typedef IT value_type;
typedef ptrdiff_t difference_type;
typedef IT *pointer;
typedef IT &reference;
static const SizeT element_stride = IndirectHelper<T>::element_stride;
VectorIterator(Data data, SizeT i) : data_(data + element_stride * i) {}
VectorIterator(const VectorIterator &other) : data_(other.data_) {}
VectorIterator() : data_(nullptr) {}
VectorIterator &operator=(const VectorIterator &other) {
data_ = other.data_;
return *this;
}
VectorIterator &operator=(VectorIterator &&other) {
data_ = other.data_;
return *this;
}
bool operator==(const VectorIterator &other) const {
return data_ == other.data_;
}
bool operator<(const VectorIterator &other) const {
return data_ < other.data_;
}
bool operator!=(const VectorIterator &other) const {
return data_ != other.data_;
}
difference_type operator-(const VectorIterator &other) const {
return (data_ - other.data_) / element_stride;
}
// Note: return type is incompatible with the standard
// `reference operator*()`.
IT operator*() const { return IndirectHelper<T>::Read(data_, 0); }
// Note: return type is incompatible with the standard
// `pointer operator->()`.
IT operator->() const { return IndirectHelper<T>::Read(data_, 0); }
VectorIterator &operator++() {
data_ += element_stride;
return *this;
}
VectorIterator operator++(int) {
VectorIterator temp(data_, 0);
data_ += element_stride;
return temp;
}
VectorIterator operator+(const SizeT &offset) const {
return VectorIterator(data_ + offset * element_stride, 0);
}
VectorIterator &operator+=(const SizeT &offset) {
data_ += offset * element_stride;
return *this;
}
VectorIterator &operator--() {
data_ -= element_stride;
return *this;
}
VectorIterator operator--(int) {
VectorIterator temp(data_, 0);
data_ -= element_stride;
return temp;
}
VectorIterator operator-(const SizeT &offset) const {
return VectorIterator(data_ - offset * element_stride, 0);
}
VectorIterator &operator-=(const SizeT &offset) {
data_ -= offset * element_stride;
return *this;
}
private:
Data data_;
};
template<typename T, typename IT, typename SizeT = uoffset_t>
using VectorConstIterator = VectorIterator<T, IT, const uint8_t *, SizeT>;
template<typename Iterator>
struct VectorReverseIterator : public std::reverse_iterator<Iterator> {
explicit VectorReverseIterator(Iterator iter)
: std::reverse_iterator<Iterator>(iter) {}
// Note: return type is incompatible with the standard
// `reference operator*()`.
typename Iterator::value_type operator*() const {
auto tmp = std::reverse_iterator<Iterator>::current;
return *--tmp;
}
// Note: return type is incompatible with the standard
// `pointer operator->()`.
typename Iterator::value_type operator->() const {
auto tmp = std::reverse_iterator<Iterator>::current;
return *--tmp;
}
};
// This is used as a helper type for accessing vectors.
// Vector::data() assumes the vector elements start after the length field.
template<typename T, typename SizeT = uoffset_t> class Vector {
public:
typedef VectorIterator<T,
typename IndirectHelper<T>::mutable_return_type,
uint8_t *, SizeT>
iterator;
typedef VectorConstIterator<T, typename IndirectHelper<T>::return_type,
SizeT>
const_iterator;
typedef VectorReverseIterator<iterator> reverse_iterator;
typedef VectorReverseIterator<const_iterator> const_reverse_iterator;
typedef typename flatbuffers::bool_constant<flatbuffers::is_scalar<T>::value>
scalar_tag;
static FLATBUFFERS_CONSTEXPR bool is_span_observable =
scalar_tag::value && (FLATBUFFERS_LITTLEENDIAN || sizeof(T) == 1);
SizeT size() const { return EndianScalar(length_); }
// Deprecated: use size(). Here for backwards compatibility.
FLATBUFFERS_ATTRIBUTE([[deprecated("use size() instead")]])
SizeT Length() const { return size(); }
typedef SizeT size_type;
typedef typename IndirectHelper<T>::return_type return_type;
typedef typename IndirectHelper<T>::mutable_return_type
mutable_return_type;
typedef return_type value_type;
return_type Get(SizeT i) const {
FLATBUFFERS_ASSERT(i < size());
return IndirectHelper<T>::Read(Data(), i);
}
return_type operator[](SizeT i) const { return Get(i); }
// If this is a Vector of enums, T will be its storage type, not the enum
// type. This function makes it convenient to retrieve value with enum
// type E.
template<typename E> E GetEnum(SizeT i) const {
return static_cast<E>(Get(i));
}
// If this a vector of unions, this does the cast for you. There's no check
// to make sure this is the right type!
template<typename U> const U *GetAs(SizeT i) const {
return reinterpret_cast<const U *>(Get(i));
}
// If this a vector of unions, this does the cast for you. There's no check
// to make sure this is actually a string!
const String *GetAsString(SizeT i) const {
return reinterpret_cast<const String *>(Get(i));
}
const void *GetStructFromOffset(size_t o) const {
return reinterpret_cast<const void *>(Data() + o);
}
iterator begin() { return iterator(Data(), 0); }
const_iterator begin() const { return const_iterator(Data(), 0); }
iterator end() { return iterator(Data(), size()); }
const_iterator end() const { return const_iterator(Data(), size()); }
reverse_iterator rbegin() { return reverse_iterator(end()); }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
reverse_iterator rend() { return reverse_iterator(begin()); }
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const_reverse_iterator crbegin() const { return rbegin(); }
const_reverse_iterator crend() const { return rend(); }
// Change elements if you have a non-const pointer to this object.
// Scalars only. See reflection.h, and the documentation.
void Mutate(SizeT i, const T &val) {
FLATBUFFERS_ASSERT(i < size());
WriteScalar(data() + i, val);
}
// Change an element of a vector of tables (or strings).
// "val" points to the new table/string, as you can obtain from
// e.g. reflection::AddFlatBuffer().
void MutateOffset(SizeT i, const uint8_t *val) {
FLATBUFFERS_ASSERT(i < size());
static_assert(sizeof(T) == sizeof(SizeT), "Unrelated types");
WriteScalar(data() + i,
static_cast<SizeT>(val - (Data() + i * sizeof(SizeT))));
}
// Get a mutable pointer to tables/strings inside this vector.
mutable_return_type GetMutableObject(SizeT i) const {
FLATBUFFERS_ASSERT(i < size());
return const_cast<mutable_return_type>(IndirectHelper<T>::Read(Data(), i));
}
// The raw data in little endian format. Use with care.
const uint8_t *Data() const {
return reinterpret_cast<const uint8_t *>(&length_ + 1);
}
uint8_t *Data() { return reinterpret_cast<uint8_t *>(&length_ + 1); }
// Similarly, but typed, much like std::vector::data
const T *data() const { return reinterpret_cast<const T *>(Data()); }
T *data() { return reinterpret_cast<T *>(Data()); }
template<typename K> return_type LookupByKey(K key) const {
void *search_result = std::bsearch(
&key, Data(), size(), IndirectHelper<T>::element_stride, KeyCompare<K>);
if (!search_result) {
return nullptr; // Key not found.
}
const uint8_t *element = reinterpret_cast<const uint8_t *>(search_result);
return IndirectHelper<T>::Read(element, 0);
}
template<typename K> mutable_return_type MutableLookupByKey(K key) {
return const_cast<mutable_return_type>(LookupByKey(key));
}
protected:
// This class is only used to access pre-existing data. Don't ever
// try to construct these manually.
Vector();
SizeT length_;
private:
// This class is a pointer. Copying will therefore create an invalid object.
// Private and unimplemented copy constructor.
Vector(const Vector &);
Vector &operator=(const Vector &);
template<typename K> static int KeyCompare(const void *ap, const void *bp) {
const K *key = reinterpret_cast<const K *>(ap);
const uint8_t *data = reinterpret_cast<const uint8_t *>(bp);
auto table = IndirectHelper<T>::Read(data, 0);
// std::bsearch compares with the operands transposed, so we negate the
// result here.
return -table->KeyCompareWithValue(*key);
}
};
template<typename T> using Vector64 = Vector<T, uoffset64_t>;
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<U> make_span(Vector<U> &vec)
FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::is_span_observable,
"wrong type U, only LE-scalar, or byte types are allowed");
return span<U>(vec.data(), vec.size());
}
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const U> make_span(
const Vector<U> &vec) FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::is_span_observable,
"wrong type U, only LE-scalar, or byte types are allowed");
return span<const U>(vec.data(), vec.size());
}
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<uint8_t> make_bytes_span(
Vector<U> &vec) FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::scalar_tag::value,
"wrong type U, only LE-scalar, or byte types are allowed");
return span<uint8_t>(vec.Data(), vec.size() * sizeof(U));
}
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const uint8_t> make_bytes_span(
const Vector<U> &vec) FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::scalar_tag::value,
"wrong type U, only LE-scalar, or byte types are allowed");
return span<const uint8_t>(vec.Data(), vec.size() * sizeof(U));
}
// Convenient helper functions to get a span of any vector, regardless
// of whether it is null or not (the field is not set).
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<U> make_span(Vector<U> *ptr)
FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::is_span_observable,
"wrong type U, only LE-scalar, or byte types are allowed");
return ptr ? make_span(*ptr) : span<U>();
}
template<class U>
FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const U> make_span(
const Vector<U> *ptr) FLATBUFFERS_NOEXCEPT {
static_assert(Vector<U>::is_span_observable,
"wrong type U, only LE-scalar, or byte types are allowed");
return ptr ? make_span(*ptr) : span<const U>();
}
// Represent a vector much like the template above, but in this case we
// don't know what the element types are (used with reflection.h).
class VectorOfAny {
public:
uoffset_t size() const { return EndianScalar(length_); }
const uint8_t *Data() const {
return reinterpret_cast<const uint8_t *>(&length_ + 1);
}
uint8_t *Data() { return reinterpret_cast<uint8_t *>(&length_ + 1); }
protected:
VectorOfAny();
uoffset_t length_;
private:
VectorOfAny(const VectorOfAny &);
VectorOfAny &operator=(const VectorOfAny &);
};
template<typename T, typename U>
Vector<Offset<T>> *VectorCast(Vector<Offset<U>> *ptr) {
static_assert(std::is_base_of<T, U>::value, "Unrelated types");
return reinterpret_cast<Vector<Offset<T>> *>(ptr);
}
template<typename T, typename U>
const Vector<Offset<T>> *VectorCast(const Vector<Offset<U>> *ptr) {
static_assert(std::is_base_of<T, U>::value, "Unrelated types");
return reinterpret_cast<const Vector<Offset<T>> *>(ptr);
}
// Convenient helper function to get the length of any vector, regardless
// of whether it is null or not (the field is not set).
template<typename T> static inline size_t VectorLength(const Vector<T> *v) {
return v ? v->size() : 0;
}
} // namespace flatbuffers
#endif // FLATBUFFERS_VERIFIER_H_

View File

@ -0,0 +1,288 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_VECTOR_DOWNWARD_H_
#define FLATBUFFERS_VECTOR_DOWNWARD_H_
#include <cstdint>
#include <algorithm>
#include "flatbuffers/base.h"
#include "flatbuffers/default_allocator.h"
#include "flatbuffers/detached_buffer.h"
namespace flatbuffers {
// This is a minimal replication of std::vector<uint8_t> functionality,
// except growing from higher to lower addresses. i.e. push_back() inserts data
// in the lowest address in the vector.
// Since this vector leaves the lower part unused, we support a "scratch-pad"
// that can be stored there for temporary data, to share the allocated space.
// Essentially, this supports 2 std::vectors in a single buffer.
template<typename SizeT = uoffset_t> class vector_downward {
public:
explicit vector_downward(size_t initial_size, Allocator *allocator,
bool own_allocator, size_t buffer_minalign,
const SizeT max_size = FLATBUFFERS_MAX_BUFFER_SIZE)
: allocator_(allocator),
own_allocator_(own_allocator),
initial_size_(initial_size),
max_size_(max_size),
buffer_minalign_(buffer_minalign),
reserved_(0),
size_(0),
buf_(nullptr),
cur_(nullptr),
scratch_(nullptr) {}
vector_downward(vector_downward &&other) noexcept
// clang-format on
: allocator_(other.allocator_),
own_allocator_(other.own_allocator_),
initial_size_(other.initial_size_),
max_size_(other.max_size_),
buffer_minalign_(other.buffer_minalign_),
reserved_(other.reserved_),
size_(other.size_),
buf_(other.buf_),
cur_(other.cur_),
scratch_(other.scratch_) {
// No change in other.allocator_
// No change in other.initial_size_
// No change in other.buffer_minalign_
other.own_allocator_ = false;
other.reserved_ = 0;
other.buf_ = nullptr;
other.cur_ = nullptr;
other.scratch_ = nullptr;
}
vector_downward &operator=(vector_downward &&other) noexcept {
// Move construct a temporary and swap idiom
vector_downward temp(std::move(other));
swap(temp);
return *this;
}
~vector_downward() {
clear_buffer();
clear_allocator();
}
void reset() {
clear_buffer();
clear();
}
void clear() {
if (buf_) {
cur_ = buf_ + reserved_;
} else {
reserved_ = 0;
cur_ = nullptr;
}
size_ = 0;
clear_scratch();
}
void clear_scratch() { scratch_ = buf_; }
void clear_allocator() {
if (own_allocator_ && allocator_) { delete allocator_; }
allocator_ = nullptr;
own_allocator_ = false;
}
void clear_buffer() {
if (buf_) Deallocate(allocator_, buf_, reserved_);
buf_ = nullptr;
}
// Relinquish the pointer to the caller.
uint8_t *release_raw(size_t &allocated_bytes, size_t &offset) {
auto *buf = buf_;
allocated_bytes = reserved_;
offset = vector_downward::offset();
// release_raw only relinquishes the buffer ownership.
// Does not deallocate or reset the allocator. Destructor will do that.
buf_ = nullptr;
clear();
return buf;
}
// Relinquish the pointer to the caller.
DetachedBuffer release() {
// allocator ownership (if any) is transferred to DetachedBuffer.
DetachedBuffer fb(allocator_, own_allocator_, buf_, reserved_, cur_,
size());
if (own_allocator_) {
allocator_ = nullptr;
own_allocator_ = false;
}
buf_ = nullptr;
clear();
return fb;
}
size_t ensure_space(size_t len) {
FLATBUFFERS_ASSERT(cur_ >= scratch_ && scratch_ >= buf_);
// If the length is larger than the unused part of the buffer, we need to
// grow.
if (len > unused_buffer_size()) { reallocate(len); }
FLATBUFFERS_ASSERT(size() < max_size_);
return len;
}
inline uint8_t *make_space(size_t len) {
if (len) {
ensure_space(len);
cur_ -= len;
size_ += static_cast<SizeT>(len);
}
return cur_;
}
// Returns nullptr if using the DefaultAllocator.
Allocator *get_custom_allocator() { return allocator_; }
// The current offset into the buffer.
size_t offset() const { return cur_ - buf_; }
// The total size of the vector (both the buffer and scratch parts).
inline SizeT size() const { return size_; }
// The size of the buffer part of the vector that is currently unused.
SizeT unused_buffer_size() const { return static_cast<SizeT>(cur_ - scratch_); }
// The size of the scratch part of the vector.
SizeT scratch_size() const { return static_cast<SizeT>(scratch_ - buf_); }
size_t capacity() const { return reserved_; }
uint8_t *data() const {
FLATBUFFERS_ASSERT(cur_);
return cur_;
}
uint8_t *scratch_data() const {
FLATBUFFERS_ASSERT(buf_);
return buf_;
}
uint8_t *scratch_end() const {
FLATBUFFERS_ASSERT(scratch_);
return scratch_;
}
uint8_t *data_at(size_t offset) const { return buf_ + reserved_ - offset; }
void push(const uint8_t *bytes, size_t num) {
if (num > 0) { memcpy(make_space(num), bytes, num); }
}
// Specialized version of push() that avoids memcpy call for small data.
template<typename T> void push_small(const T &little_endian_t) {
make_space(sizeof(T));
*reinterpret_cast<T *>(cur_) = little_endian_t;
}
template<typename T> void scratch_push_small(const T &t) {
ensure_space(sizeof(T));
*reinterpret_cast<T *>(scratch_) = t;
scratch_ += sizeof(T);
}
// fill() is most frequently called with small byte counts (<= 4),
// which is why we're using loops rather than calling memset.
void fill(size_t zero_pad_bytes) {
make_space(zero_pad_bytes);
for (size_t i = 0; i < zero_pad_bytes; i++) cur_[i] = 0;
}
// Version for when we know the size is larger.
// Precondition: zero_pad_bytes > 0
void fill_big(size_t zero_pad_bytes) {
memset(make_space(zero_pad_bytes), 0, zero_pad_bytes);
}
void pop(size_t bytes_to_remove) {
cur_ += bytes_to_remove;
size_ -= static_cast<SizeT>(bytes_to_remove);
}
void scratch_pop(size_t bytes_to_remove) { scratch_ -= bytes_to_remove; }
void swap(vector_downward &other) {
using std::swap;
swap(allocator_, other.allocator_);
swap(own_allocator_, other.own_allocator_);
swap(initial_size_, other.initial_size_);
swap(buffer_minalign_, other.buffer_minalign_);
swap(reserved_, other.reserved_);
swap(size_, other.size_);
swap(max_size_, other.max_size_);
swap(buf_, other.buf_);
swap(cur_, other.cur_);
swap(scratch_, other.scratch_);
}
void swap_allocator(vector_downward &other) {
using std::swap;
swap(allocator_, other.allocator_);
swap(own_allocator_, other.own_allocator_);
}
private:
// You shouldn't really be copying instances of this class.
FLATBUFFERS_DELETE_FUNC(vector_downward(const vector_downward &));
FLATBUFFERS_DELETE_FUNC(vector_downward &operator=(const vector_downward &));
Allocator *allocator_;
bool own_allocator_;
size_t initial_size_;
// The maximum size the vector can be.
SizeT max_size_;
size_t buffer_minalign_;
size_t reserved_;
SizeT size_;
uint8_t *buf_;
uint8_t *cur_; // Points at location between empty (below) and used (above).
uint8_t *scratch_; // Points to the end of the scratchpad in use.
void reallocate(size_t len) {
auto old_reserved = reserved_;
auto old_size = size();
auto old_scratch_size = scratch_size();
reserved_ +=
(std::max)(len, old_reserved ? old_reserved / 2 : initial_size_);
reserved_ = (reserved_ + buffer_minalign_ - 1) & ~(buffer_minalign_ - 1);
if (buf_) {
buf_ = ReallocateDownward(allocator_, buf_, old_reserved, reserved_,
old_size, old_scratch_size);
} else {
buf_ = Allocate(allocator_, reserved_);
}
cur_ = buf_ + reserved_ - old_size;
scratch_ = buf_ + old_scratch_size;
}
};
} // namespace flatbuffers
#endif // FLATBUFFERS_VECTOR_DOWNWARD_H_

View File

@ -0,0 +1,330 @@
/*
* Copyright 2021 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLATBUFFERS_VERIFIER_H_
#define FLATBUFFERS_VERIFIER_H_
#include "flatbuffers/base.h"
#include "flatbuffers/vector.h"
namespace flatbuffers {
// Helper class to verify the integrity of a FlatBuffer
class Verifier FLATBUFFERS_FINAL_CLASS {
public:
struct Options {
// The maximum nesting of tables and vectors before we call it invalid.
uoffset_t max_depth = 64;
// The maximum number of tables we will verify before we call it invalid.
uoffset_t max_tables = 1000000;
// If true, verify all data is aligned.
bool check_alignment = true;
// If true, run verifier on nested flatbuffers
bool check_nested_flatbuffers = true;
// The maximum size of a buffer.
size_t max_size = FLATBUFFERS_MAX_BUFFER_SIZE;
// Use assertions to check for errors.
bool assert = false;
};
explicit Verifier(const uint8_t *const buf, const size_t buf_len,
const Options &opts)
: buf_(buf), size_(buf_len), opts_(opts) {
FLATBUFFERS_ASSERT(size_ < opts.max_size);
}
// Deprecated API, please construct with Verifier::Options.
Verifier(const uint8_t *const buf, const size_t buf_len,
const uoffset_t max_depth = 64, const uoffset_t max_tables = 1000000,
const bool check_alignment = true)
: Verifier(buf, buf_len, [&] {
Options opts;
opts.max_depth = max_depth;
opts.max_tables = max_tables;
opts.check_alignment = check_alignment;
return opts;
}()) {}
// Central location where any verification failures register.
bool Check(const bool ok) const {
// clang-format off
#ifdef FLATBUFFERS_DEBUG_VERIFICATION_FAILURE
if (opts_.assert) { FLATBUFFERS_ASSERT(ok); }
#endif
#ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE
if (!ok)
upper_bound_ = 0;
#endif
// clang-format on
return ok;
}
// Verify any range within the buffer.
bool Verify(const size_t elem, const size_t elem_len) const {
// clang-format off
#ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE
auto upper_bound = elem + elem_len;
if (upper_bound_ < upper_bound)
upper_bound_ = upper_bound;
#endif
// clang-format on
return Check(elem_len < size_ && elem <= size_ - elem_len);
}
bool VerifyAlignment(const size_t elem, const size_t align) const {
return Check((elem & (align - 1)) == 0 || !opts_.check_alignment);
}
// Verify a range indicated by sizeof(T).
template<typename T> bool Verify(const size_t elem) const {
return VerifyAlignment(elem, sizeof(T)) && Verify(elem, sizeof(T));
}
bool VerifyFromPointer(const uint8_t *const p, const size_t len) {
return Verify(static_cast<size_t>(p - buf_), len);
}
// Verify relative to a known-good base pointer.
bool VerifyFieldStruct(const uint8_t *const base, const voffset_t elem_off,
const size_t elem_len, const size_t align) const {
const auto f = static_cast<size_t>(base - buf_) + elem_off;
return VerifyAlignment(f, align) && Verify(f, elem_len);
}
template<typename T>
bool VerifyField(const uint8_t *const base, const voffset_t elem_off,
const size_t align) const {
const auto f = static_cast<size_t>(base - buf_) + elem_off;
return VerifyAlignment(f, align) && Verify(f, sizeof(T));
}
// Verify a pointer (may be NULL) of a table type.
template<typename T> bool VerifyTable(const T *const table) {
return !table || table->Verify(*this);
}
// Verify a pointer (may be NULL) of any vector type.
template<int &..., typename T, typename LenT>
bool VerifyVector(const Vector<T, LenT> *const vec) const {
return !vec || VerifyVectorOrString<LenT>(
reinterpret_cast<const uint8_t *>(vec), sizeof(T));
}
// Verify a pointer (may be NULL) of a vector to struct.
template<int &..., typename T, typename LenT>
bool VerifyVector(const Vector<const T *, LenT> *const vec) const {
return VerifyVector(reinterpret_cast<const Vector<T, LenT> *>(vec));
}
// Verify a pointer (may be NULL) to string.
bool VerifyString(const String *const str) const {
size_t end;
return !str || (VerifyVectorOrString<uoffset_t>(
reinterpret_cast<const uint8_t *>(str), 1, &end) &&
Verify(end, 1) && // Must have terminator
Check(buf_[end] == '\0')); // Terminating byte must be 0.
}
// Common code between vectors and strings.
template<typename LenT = uoffset_t>
bool VerifyVectorOrString(const uint8_t *const vec, const size_t elem_size,
size_t *const end = nullptr) const {
const auto vec_offset = static_cast<size_t>(vec - buf_);
// Check we can read the size field.
if (!Verify<LenT>(vec_offset)) return false;
// Check the whole array. If this is a string, the byte past the array must
// be 0.
const LenT size = ReadScalar<LenT>(vec);
const auto max_elems = opts_.max_size / elem_size;
if (!Check(size < max_elems))
return false; // Protect against byte_size overflowing.
const auto byte_size = sizeof(LenT) + elem_size * size;
if (end) *end = vec_offset + byte_size;
return Verify(vec_offset, byte_size);
}
// Special case for string contents, after the above has been called.
bool VerifyVectorOfStrings(const Vector<Offset<String>> *const vec) const {
if (vec) {
for (uoffset_t i = 0; i < vec->size(); i++) {
if (!VerifyString(vec->Get(i))) return false;
}
}
return true;
}
// Special case for table contents, after the above has been called.
template<typename T>
bool VerifyVectorOfTables(const Vector<Offset<T>> *const vec) {
if (vec) {
for (uoffset_t i = 0; i < vec->size(); i++) {
if (!vec->Get(i)->Verify(*this)) return false;
}
}
return true;
}
__suppress_ubsan__("unsigned-integer-overflow") bool VerifyTableStart(
const uint8_t *const table) {
// Check the vtable offset.
const auto tableo = static_cast<size_t>(table - buf_);
if (!Verify<soffset_t>(tableo)) return false;
// This offset may be signed, but doing the subtraction unsigned always
// gives the result we want.
const auto vtableo =
tableo - static_cast<size_t>(ReadScalar<soffset_t>(table));
// Check the vtable size field, then check vtable fits in its entirety.
if (!(VerifyComplexity() && Verify<voffset_t>(vtableo) &&
VerifyAlignment(ReadScalar<voffset_t>(buf_ + vtableo),
sizeof(voffset_t))))
return false;
const auto vsize = ReadScalar<voffset_t>(buf_ + vtableo);
return Check((vsize & 1) == 0) && Verify(vtableo, vsize);
}
template<typename T>
bool VerifyBufferFromStart(const char *const identifier, const size_t start) {
// Buffers have to be of some size to be valid. The reason it is a runtime
// check instead of static_assert, is that nested flatbuffers go through
// this call and their size is determined at runtime.
if (!Check(size_ >= FLATBUFFERS_MIN_BUFFER_SIZE)) return false;
// If an identifier is provided, check that we have a buffer
if (identifier && !Check((size_ >= 2 * sizeof(flatbuffers::uoffset_t) &&
BufferHasIdentifier(buf_ + start, identifier)))) {
return false;
}
// Call T::Verify, which must be in the generated code for this type.
const auto o = VerifyOffset<uoffset_t>(start);
return Check(o != 0) &&
reinterpret_cast<const T *>(buf_ + start + o)->Verify(*this)
// clang-format off
#ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE
&& GetComputedSize()
#endif
;
// clang-format on
}
template<typename T, int &..., typename SizeT>
bool VerifyNestedFlatBuffer(const Vector<uint8_t, SizeT> *const buf,
const char *const identifier) {
// Caller opted out of this.
if (!opts_.check_nested_flatbuffers) return true;
// An empty buffer is OK as it indicates not present.
if (!buf) return true;
// If there is a nested buffer, it must be greater than the min size.
if (!Check(buf->size() >= FLATBUFFERS_MIN_BUFFER_SIZE)) return false;
Verifier nested_verifier(buf->data(), buf->size(), opts_);
return nested_verifier.VerifyBuffer<T>(identifier);
}
// Verify this whole buffer, starting with root type T.
template<typename T> bool VerifyBuffer() { return VerifyBuffer<T>(nullptr); }
template<typename T> bool VerifyBuffer(const char *const identifier) {
return VerifyBufferFromStart<T>(identifier, 0);
}
template<typename T, typename SizeT = uoffset_t>
bool VerifySizePrefixedBuffer(const char *const identifier) {
return Verify<SizeT>(0U) &&
Check(ReadScalar<SizeT>(buf_) == size_ - sizeof(SizeT)) &&
VerifyBufferFromStart<T>(identifier, sizeof(SizeT));
}
template<typename OffsetT = uoffset_t, typename SOffsetT = soffset_t>
size_t VerifyOffset(const size_t start) const {
if (!Verify<OffsetT>(start)) return 0;
const auto o = ReadScalar<OffsetT>(buf_ + start);
// May not point to itself.
if (!Check(o != 0)) return 0;
// Can't wrap around larger than the max size.
if (!Check(static_cast<SOffsetT>(o) >= 0)) return 0;
// Must be inside the buffer to create a pointer from it (pointer outside
// buffer is UB).
if (!Verify(start + o, 1)) return 0;
return o;
}
template<typename OffsetT = uoffset_t>
size_t VerifyOffset(const uint8_t *const base, const voffset_t start) const {
return VerifyOffset<OffsetT>(static_cast<size_t>(base - buf_) + start);
}
// Called at the start of a table to increase counters measuring data
// structure depth and amount, and possibly bails out with false if limits set
// by the constructor have been hit. Needs to be balanced with EndTable().
bool VerifyComplexity() {
depth_++;
num_tables_++;
return Check(depth_ <= opts_.max_depth && num_tables_ <= opts_.max_tables);
}
// Called at the end of a table to pop the depth count.
bool EndTable() {
depth_--;
return true;
}
// Returns the message size in bytes
size_t GetComputedSize() const {
// clang-format off
#ifdef FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE
uintptr_t size = upper_bound_;
// Align the size to uoffset_t
size = (size - 1 + sizeof(uoffset_t)) & ~(sizeof(uoffset_t) - 1);
return (size > size_) ? 0 : size;
#else
// Must turn on FLATBUFFERS_TRACK_VERIFIER_BUFFER_SIZE for this to work.
(void)upper_bound_;
FLATBUFFERS_ASSERT(false);
return 0;
#endif
// clang-format on
}
std::vector<uint8_t> *GetFlexReuseTracker() { return flex_reuse_tracker_; }
void SetFlexReuseTracker(std::vector<uint8_t> *const rt) {
flex_reuse_tracker_ = rt;
}
private:
const uint8_t *buf_;
const size_t size_;
const Options opts_;
mutable size_t upper_bound_ = 0;
uoffset_t depth_ = 0;
uoffset_t num_tables_ = 0;
std::vector<uint8_t> *flex_reuse_tracker_ = nullptr;
};
// Specialization for 64-bit offsets.
template<>
inline size_t Verifier::VerifyOffset<uoffset64_t>(const size_t start) const {
return VerifyOffset<uoffset64_t, soffset64_t>(start);
}
} // namespace flatbuffers
#endif // FLATBUFFERS_VERIFIER_H_

View File

@ -0,0 +1,9 @@
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
set(HAL_LIB_NAME "")
set(RVV_HAL_FOUND TRUE CACHE INTERNAL "")
set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "")
set(RVV_HAL_LIBRARIES ${HAL_LIB_NAME} CACHE INTERNAL "")
set(RVV_HAL_HEADERS "hal_rvv.hpp" CACHE INTERNAL "")
set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "")

View File

@ -0,0 +1,22 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_HPP_INCLUDED
#define OPENCV_HAL_RVV_HPP_INCLUDED
#include "opencv2/core/hal/interface.h"
#ifndef CV_HAL_RVV_071_ENABLED
# if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) && defined(__riscv_v) && __riscv_v == 7000
# define CV_HAL_RVV_071_ENABLED 1
# else
# define CV_HAL_RVV_071_ENABLED 0
# endif
#endif
#if CV_HAL_RVV_071_ENABLED
#include "version/hal_rvv_071.hpp"
#endif
#endif

View File

@ -0,0 +1,109 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_071_HPP_INCLUDED
#define OPENCV_HAL_RVV_071_HPP_INCLUDED
#include <riscv_vector.h>
#include <limits>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_cvtBGRtoBGR
#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::cvtBGRtoBGR
static const unsigned char index_array_32 [32]
{ 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 };
static const unsigned char index_array_24 [24]
{ 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 };
static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize)
{
vuint8m2_t vec_index = vle8_v_u8m2(index, vsize);
int i = 0;
for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize)
{
vuint8m2_t vec_src = vle8_v_u8m2(src, vsize);
vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize);
vse8_v_u8m2(dst, vec_dst, vsize);
}
for ( ; i < n; i++, src += scn, dst += dcn )
{
unsigned char t0 = src[0], t1 = src[1], t2 = src[2];
dst[2] = t0;
dst[1] = t1;
dst[0] = t2;
if(dcn == 4)
{
unsigned char d = src[3];
dst[3] = d;
}
}
}
static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi)
{
for (int i = 0; i < n; i++, src += scn, dst += dcn)
{
unsigned char t0 = src[0], t1 = src[1], t2 = src[2];
dst[bi ] = t0;
dst[1] = t1;
dst[bi^2] = t2;
if(dcn == 4)
{
unsigned char d = scn == 4 ? src[3] : std::numeric_limits<unsigned char>::max();
dst[3] = d;
}
}
}
static int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue)
{
if (depth != CV_8U)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
const int blueIdx = swapBlue ? 2 : 0;
if (scn == dcn)
{
if (!swapBlue)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
const int vsize_pixels = 8;
if (scn == 4)
{
for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
{
vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32);
}
}
else
{
for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
{
vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24);
}
}
}
else
{
for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx);
}
return CV_HAL_ERROR_OK;
}
}}
#endif

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More