rsCpuIntrinsicConvolve5x5.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3
  1. /*
  2. * Copyright (C) 2012 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "rsCpuIntrinsic.h"
  17. #include "rsCpuIntrinsicInlines.h"
  18. namespace android {
  19. namespace renderscript {
  20. class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
  21. public:
  22. void populateScript(Script *) override;
  23. void invokeFreeChildren() override;
  24. void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
  25. void setGlobalObj(uint32_t slot, ObjectBase *data) override;
  26. ~RsdCpuScriptIntrinsicConvolve5x5() override;
  27. RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
  28. protected:
  29. float mFp[28];
  30. int16_t mIp[28];
  31. ObjectBaseRef<Allocation> alloc;
  32. static void kernelU1(const RsExpandKernelDriverInfo *info,
  33. uint32_t xstart, uint32_t xend,
  34. uint32_t outstep);
  35. static void kernelU2(const RsExpandKernelDriverInfo *info,
  36. uint32_t xstart, uint32_t xend,
  37. uint32_t outstep);
  38. static void kernelU4(const RsExpandKernelDriverInfo *info,
  39. uint32_t xstart, uint32_t xend,
  40. uint32_t outstep);
  41. static void kernelF1(const RsExpandKernelDriverInfo *info,
  42. uint32_t xstart, uint32_t xend,
  43. uint32_t outstep);
  44. static void kernelF2(const RsExpandKernelDriverInfo *info,
  45. uint32_t xstart, uint32_t xend,
  46. uint32_t outstep);
  47. static void kernelF4(const RsExpandKernelDriverInfo *info,
  48. uint32_t xstart, uint32_t xend,
  49. uint32_t outstep);
  50. };
  51. void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
  52. rsAssert(slot == 1);
  53. alloc.set(static_cast<Allocation *>(data));
  54. }
  55. void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
  56. const void *data, size_t dataLength) {
  57. rsAssert(slot == 0);
  58. memcpy (&mFp, data, dataLength);
  59. for(int ct=0; ct < 25; ct++) {
  60. if (mFp[ct] >= 0) {
  61. mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
  62. } else {
  63. mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
  64. }
  65. }
  66. }
  67. static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
  68. const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
  69. const float* coeff) {
  70. uint32_t x0 = rsMax((int32_t)x-2, 0);
  71. uint32_t x1 = rsMax((int32_t)x-1, 0);
  72. uint32_t x2 = x;
  73. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  74. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  75. float4 px = convert_float4(py0[x0]) * coeff[0] +
  76. convert_float4(py0[x1]) * coeff[1] +
  77. convert_float4(py0[x2]) * coeff[2] +
  78. convert_float4(py0[x3]) * coeff[3] +
  79. convert_float4(py0[x4]) * coeff[4] +
  80. convert_float4(py1[x0]) * coeff[5] +
  81. convert_float4(py1[x1]) * coeff[6] +
  82. convert_float4(py1[x2]) * coeff[7] +
  83. convert_float4(py1[x3]) * coeff[8] +
  84. convert_float4(py1[x4]) * coeff[9] +
  85. convert_float4(py2[x0]) * coeff[10] +
  86. convert_float4(py2[x1]) * coeff[11] +
  87. convert_float4(py2[x2]) * coeff[12] +
  88. convert_float4(py2[x3]) * coeff[13] +
  89. convert_float4(py2[x4]) * coeff[14] +
  90. convert_float4(py3[x0]) * coeff[15] +
  91. convert_float4(py3[x1]) * coeff[16] +
  92. convert_float4(py3[x2]) * coeff[17] +
  93. convert_float4(py3[x3]) * coeff[18] +
  94. convert_float4(py3[x4]) * coeff[19] +
  95. convert_float4(py4[x0]) * coeff[20] +
  96. convert_float4(py4[x1]) * coeff[21] +
  97. convert_float4(py4[x2]) * coeff[22] +
  98. convert_float4(py4[x3]) * coeff[23] +
  99. convert_float4(py4[x4]) * coeff[24];
  100. px = clamp(px + 0.5f, 0.f, 255.f);
  101. *out = convert_uchar4(px);
  102. }
  103. static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
  104. const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
  105. const float* coeff) {
  106. uint32_t x0 = rsMax((int32_t)x-2, 0);
  107. uint32_t x1 = rsMax((int32_t)x-1, 0);
  108. uint32_t x2 = x;
  109. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  110. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  111. float2 px = convert_float2(py0[x0]) * coeff[0] +
  112. convert_float2(py0[x1]) * coeff[1] +
  113. convert_float2(py0[x2]) * coeff[2] +
  114. convert_float2(py0[x3]) * coeff[3] +
  115. convert_float2(py0[x4]) * coeff[4] +
  116. convert_float2(py1[x0]) * coeff[5] +
  117. convert_float2(py1[x1]) * coeff[6] +
  118. convert_float2(py1[x2]) * coeff[7] +
  119. convert_float2(py1[x3]) * coeff[8] +
  120. convert_float2(py1[x4]) * coeff[9] +
  121. convert_float2(py2[x0]) * coeff[10] +
  122. convert_float2(py2[x1]) * coeff[11] +
  123. convert_float2(py2[x2]) * coeff[12] +
  124. convert_float2(py2[x3]) * coeff[13] +
  125. convert_float2(py2[x4]) * coeff[14] +
  126. convert_float2(py3[x0]) * coeff[15] +
  127. convert_float2(py3[x1]) * coeff[16] +
  128. convert_float2(py3[x2]) * coeff[17] +
  129. convert_float2(py3[x3]) * coeff[18] +
  130. convert_float2(py3[x4]) * coeff[19] +
  131. convert_float2(py4[x0]) * coeff[20] +
  132. convert_float2(py4[x1]) * coeff[21] +
  133. convert_float2(py4[x2]) * coeff[22] +
  134. convert_float2(py4[x3]) * coeff[23] +
  135. convert_float2(py4[x4]) * coeff[24];
  136. px = clamp(px + 0.5f, 0.f, 255.f);
  137. *out = convert_uchar2(px);
  138. }
  139. static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
  140. const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
  141. const float* coeff) {
  142. uint32_t x0 = rsMax((int32_t)x-2, 0);
  143. uint32_t x1 = rsMax((int32_t)x-1, 0);
  144. uint32_t x2 = x;
  145. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  146. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  147. float px = (float)(py0[x0]) * coeff[0] +
  148. (float)(py0[x1]) * coeff[1] +
  149. (float)(py0[x2]) * coeff[2] +
  150. (float)(py0[x3]) * coeff[3] +
  151. (float)(py0[x4]) * coeff[4] +
  152. (float)(py1[x0]) * coeff[5] +
  153. (float)(py1[x1]) * coeff[6] +
  154. (float)(py1[x2]) * coeff[7] +
  155. (float)(py1[x3]) * coeff[8] +
  156. (float)(py1[x4]) * coeff[9] +
  157. (float)(py2[x0]) * coeff[10] +
  158. (float)(py2[x1]) * coeff[11] +
  159. (float)(py2[x2]) * coeff[12] +
  160. (float)(py2[x3]) * coeff[13] +
  161. (float)(py2[x4]) * coeff[14] +
  162. (float)(py3[x0]) * coeff[15] +
  163. (float)(py3[x1]) * coeff[16] +
  164. (float)(py3[x2]) * coeff[17] +
  165. (float)(py3[x3]) * coeff[18] +
  166. (float)(py3[x4]) * coeff[19] +
  167. (float)(py4[x0]) * coeff[20] +
  168. (float)(py4[x1]) * coeff[21] +
  169. (float)(py4[x2]) * coeff[22] +
  170. (float)(py4[x3]) * coeff[23] +
  171. (float)(py4[x4]) * coeff[24];
  172. px = clamp(px + 0.5f, 0.f, 255.f);
  173. *out = px;
  174. }
  175. static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
  176. const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
  177. const float* coeff) {
  178. uint32_t x0 = rsMax((int32_t)x-2, 0);
  179. uint32_t x1 = rsMax((int32_t)x-1, 0);
  180. uint32_t x2 = x;
  181. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  182. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  183. float4 px = py0[x0] * coeff[0] +
  184. py0[x1] * coeff[1] +
  185. py0[x2] * coeff[2] +
  186. py0[x3] * coeff[3] +
  187. py0[x4] * coeff[4] +
  188. py1[x0] * coeff[5] +
  189. py1[x1] * coeff[6] +
  190. py1[x2] * coeff[7] +
  191. py1[x3] * coeff[8] +
  192. py1[x4] * coeff[9] +
  193. py2[x0] * coeff[10] +
  194. py2[x1] * coeff[11] +
  195. py2[x2] * coeff[12] +
  196. py2[x3] * coeff[13] +
  197. py2[x4] * coeff[14] +
  198. py3[x0] * coeff[15] +
  199. py3[x1] * coeff[16] +
  200. py3[x2] * coeff[17] +
  201. py3[x3] * coeff[18] +
  202. py3[x4] * coeff[19] +
  203. py4[x0] * coeff[20] +
  204. py4[x1] * coeff[21] +
  205. py4[x2] * coeff[22] +
  206. py4[x3] * coeff[23] +
  207. py4[x4] * coeff[24];
  208. *out = px;
  209. }
  210. static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
  211. const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
  212. const float* coeff) {
  213. uint32_t x0 = rsMax((int32_t)x-2, 0);
  214. uint32_t x1 = rsMax((int32_t)x-1, 0);
  215. uint32_t x2 = x;
  216. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  217. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  218. float2 px = py0[x0] * coeff[0] +
  219. py0[x1] * coeff[1] +
  220. py0[x2] * coeff[2] +
  221. py0[x3] * coeff[3] +
  222. py0[x4] * coeff[4] +
  223. py1[x0] * coeff[5] +
  224. py1[x1] * coeff[6] +
  225. py1[x2] * coeff[7] +
  226. py1[x3] * coeff[8] +
  227. py1[x4] * coeff[9] +
  228. py2[x0] * coeff[10] +
  229. py2[x1] * coeff[11] +
  230. py2[x2] * coeff[12] +
  231. py2[x3] * coeff[13] +
  232. py2[x4] * coeff[14] +
  233. py3[x0] * coeff[15] +
  234. py3[x1] * coeff[16] +
  235. py3[x2] * coeff[17] +
  236. py3[x3] * coeff[18] +
  237. py3[x4] * coeff[19] +
  238. py4[x0] * coeff[20] +
  239. py4[x1] * coeff[21] +
  240. py4[x2] * coeff[22] +
  241. py4[x3] * coeff[23] +
  242. py4[x4] * coeff[24];
  243. *out = px;
  244. }
  245. static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
  246. const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
  247. const float* coeff) {
  248. uint32_t x0 = rsMax((int32_t)x-2, 0);
  249. uint32_t x1 = rsMax((int32_t)x-1, 0);
  250. uint32_t x2 = x;
  251. uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
  252. uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
  253. float px = py0[x0] * coeff[0] +
  254. py0[x1] * coeff[1] +
  255. py0[x2] * coeff[2] +
  256. py0[x3] * coeff[3] +
  257. py0[x4] * coeff[4] +
  258. py1[x0] * coeff[5] +
  259. py1[x1] * coeff[6] +
  260. py1[x2] * coeff[7] +
  261. py1[x3] * coeff[8] +
  262. py1[x4] * coeff[9] +
  263. py2[x0] * coeff[10] +
  264. py2[x1] * coeff[11] +
  265. py2[x2] * coeff[12] +
  266. py2[x3] * coeff[13] +
  267. py2[x4] * coeff[14] +
  268. py3[x0] * coeff[15] +
  269. py3[x1] * coeff[16] +
  270. py3[x2] * coeff[17] +
  271. py3[x3] * coeff[18] +
  272. py3[x4] * coeff[19] +
  273. py4[x0] * coeff[20] +
  274. py4[x1] * coeff[21] +
  275. py4[x2] * coeff[22] +
  276. py4[x3] * coeff[23] +
  277. py4[x4] * coeff[24];
  278. *out = px;
  279. }
  // SIMD convolution routine defined outside this file (implementation not
  // visible here). As called from kernelU4 it receives the destination, five
  // source row pointers already offset two pixels to the left of the first
  // output pixel, the fixed-point coefficient table and a block count.
  extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                            const void *y0, const void *y1, const void *y2,
                                            const void *y3, const void *y4,
                                            const int16_t *coef, uint32_t count);
  283. void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
  284. uint32_t xstart, uint32_t xend,
  285. uint32_t outstep) {
  286. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  287. if (!cp->alloc.get()) {
  288. ALOGE("Convolve5x5 executed without input, skipping");
  289. return;
  290. }
  291. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  292. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  293. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  294. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  295. uint32_t y2 = info->current.y;
  296. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  297. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  298. const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
  299. const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
  300. const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
  301. const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
  302. const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
  303. uchar4 *out = (uchar4 *)info->outPtr[0];
  304. uint32_t x1 = xstart;
  305. uint32_t x2 = xend;
  306. while((x1 < x2) && (x1 < 2)) {
  307. OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  308. out++;
  309. x1++;
  310. }
  311. #if defined(ARCH_X86_HAVE_SSSE3)
  312. // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
  313. // 3 for end boundary where x may hit the end boundary)
  314. if (gArchUseSIMD &&((x1 + 6) < x2)) {
  315. // subtract 3 for end boundary
  316. uint32_t len = (x2 - x1 - 3) >> 2;
  317. rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
  318. out += len << 2;
  319. x1 += len << 2;
  320. }
  321. #endif
  322. #if defined(ARCH_ARM_USE_INTRINSICS)
  323. if(gArchUseSIMD && ((x1 + 3) < x2)) {
  324. uint32_t len = (x2 - x1 - 3) >> 1;
  325. rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
  326. out += len << 1;
  327. x1 += len << 1;
  328. }
  329. #endif
  330. while(x1 < x2) {
  331. OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  332. out++;
  333. x1++;
  334. }
  335. }
  336. void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
  337. uint32_t xstart, uint32_t xend,
  338. uint32_t outstep) {
  339. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  340. if (!cp->alloc.get()) {
  341. ALOGE("Convolve5x5 executed without input, skipping");
  342. return;
  343. }
  344. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  345. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  346. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  347. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  348. uint32_t y2 = info->current.y;
  349. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  350. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  351. const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
  352. const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
  353. const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
  354. const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
  355. const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
  356. uchar2 *out = (uchar2 *)info->outPtr[0];
  357. uint32_t x1 = xstart;
  358. uint32_t x2 = xend;
  359. while((x1 < x2) && (x1 < 2)) {
  360. OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  361. out++;
  362. x1++;
  363. }
  364. #if 0//defined(ARCH_ARM_HAVE_NEON)
  365. if((x1 + 3) < x2) {
  366. uint32_t len = (x2 - x1 - 3) >> 1;
  367. rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
  368. out += len << 1;
  369. x1 += len << 1;
  370. }
  371. #endif
  372. while(x1 < x2) {
  373. OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  374. out++;
  375. x1++;
  376. }
  377. }
  378. void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
  379. uint32_t xstart, uint32_t xend,
  380. uint32_t outstep) {
  381. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  382. if (!cp->alloc.get()) {
  383. ALOGE("Convolve5x5 executed without input, skipping");
  384. return;
  385. }
  386. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  387. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  388. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  389. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  390. uint32_t y2 = info->current.y;
  391. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  392. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  393. const uchar *py0 = (const uchar *)(pin + stride * y0);
  394. const uchar *py1 = (const uchar *)(pin + stride * y1);
  395. const uchar *py2 = (const uchar *)(pin + stride * y2);
  396. const uchar *py3 = (const uchar *)(pin + stride * y3);
  397. const uchar *py4 = (const uchar *)(pin + stride * y4);
  398. uchar *out = (uchar *)info->outPtr[0];
  399. uint32_t x1 = xstart;
  400. uint32_t x2 = xend;
  401. while((x1 < x2) && (x1 < 2)) {
  402. OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  403. out++;
  404. x1++;
  405. }
  406. #if 0//defined(ARCH_ARM_HAVE_NEON)
  407. if((x1 + 3) < x2) {
  408. uint32_t len = (x2 - x1 - 3) >> 1;
  409. rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
  410. out += len << 1;
  411. x1 += len << 1;
  412. }
  413. #endif
  414. while(x1 < x2) {
  415. OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  416. out++;
  417. x1++;
  418. }
  419. }
  420. void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
  421. uint32_t xstart, uint32_t xend,
  422. uint32_t outstep) {
  423. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  424. if (!cp->alloc.get()) {
  425. ALOGE("Convolve5x5 executed without input, skipping");
  426. return;
  427. }
  428. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  429. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  430. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  431. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  432. uint32_t y2 = info->current.y;
  433. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  434. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  435. const float4 *py0 = (const float4 *)(pin + stride * y0);
  436. const float4 *py1 = (const float4 *)(pin + stride * y1);
  437. const float4 *py2 = (const float4 *)(pin + stride * y2);
  438. const float4 *py3 = (const float4 *)(pin + stride * y3);
  439. const float4 *py4 = (const float4 *)(pin + stride * y4);
  440. float4 *out = (float4 *)info->outPtr[0];
  441. uint32_t x1 = xstart;
  442. uint32_t x2 = xend;
  443. while((x1 < x2) && (x1 < 2)) {
  444. OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  445. out++;
  446. x1++;
  447. }
  448. #if 0//defined(ARCH_ARM_HAVE_NEON)
  449. if((x1 + 3) < x2) {
  450. uint32_t len = (x2 - x1 - 3) >> 1;
  451. rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
  452. out += len << 1;
  453. x1 += len << 1;
  454. }
  455. #endif
  456. while(x1 < x2) {
  457. OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  458. out++;
  459. x1++;
  460. }
  461. }
  462. void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
  463. uint32_t xstart, uint32_t xend,
  464. uint32_t outstep) {
  465. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  466. if (!cp->alloc.get()) {
  467. ALOGE("Convolve5x5 executed without input, skipping");
  468. return;
  469. }
  470. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  471. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  472. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  473. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  474. uint32_t y2 = info->current.y;
  475. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  476. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  477. const float2 *py0 = (const float2 *)(pin + stride * y0);
  478. const float2 *py1 = (const float2 *)(pin + stride * y1);
  479. const float2 *py2 = (const float2 *)(pin + stride * y2);
  480. const float2 *py3 = (const float2 *)(pin + stride * y3);
  481. const float2 *py4 = (const float2 *)(pin + stride * y4);
  482. float2 *out = (float2 *)info->outPtr[0];
  483. uint32_t x1 = xstart;
  484. uint32_t x2 = xend;
  485. while((x1 < x2) && (x1 < 2)) {
  486. OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  487. out++;
  488. x1++;
  489. }
  490. #if 0//defined(ARCH_ARM_HAVE_NEON)
  491. if((x1 + 3) < x2) {
  492. uint32_t len = (x2 - x1 - 3) >> 1;
  493. rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
  494. out += len << 1;
  495. x1 += len << 1;
  496. }
  497. #endif
  498. while(x1 < x2) {
  499. OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  500. out++;
  501. x1++;
  502. }
  503. }
  504. void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
  505. uint32_t xstart, uint32_t xend,
  506. uint32_t outstep) {
  507. RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
  508. if (!cp->alloc.get()) {
  509. ALOGE("Convolve5x5 executed without input, skipping");
  510. return;
  511. }
  512. const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
  513. const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
  514. uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
  515. uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
  516. uint32_t y2 = info->current.y;
  517. uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
  518. uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
  519. const float *py0 = (const float *)(pin + stride * y0);
  520. const float *py1 = (const float *)(pin + stride * y1);
  521. const float *py2 = (const float *)(pin + stride * y2);
  522. const float *py3 = (const float *)(pin + stride * y3);
  523. const float *py4 = (const float *)(pin + stride * y4);
  524. float *out = (float *)info->outPtr[0];
  525. uint32_t x1 = xstart;
  526. uint32_t x2 = xend;
  527. while((x1 < x2) && (x1 < 2)) {
  528. OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  529. out++;
  530. x1++;
  531. }
  532. #if 0//defined(ARCH_ARM_HAVE_NEON)
  533. if((x1 + 3) < x2) {
  534. uint32_t len = (x2 - x1 - 3) >> 1;
  535. rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
  536. out += len << 1;
  537. x1 += len << 1;
  538. }
  539. #endif
  540. while(x1 < x2) {
  541. OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
  542. out++;
  543. x1++;
  544. }
  545. }
  546. RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
  547. RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
  548. : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
  549. if (e->getType() == RS_TYPE_FLOAT_32) {
  550. switch(e->getVectorSize()) {
  551. case 1:
  552. mRootPtr = &kernelF1;
  553. break;
  554. case 2:
  555. mRootPtr = &kernelF2;
  556. break;
  557. case 3:
  558. case 4:
  559. mRootPtr = &kernelF4;
  560. break;
  561. }
  562. } else {
  563. switch(e->getVectorSize()) {
  564. case 1:
  565. mRootPtr = &kernelU1;
  566. break;
  567. case 2:
  568. mRootPtr = &kernelU2;
  569. break;
  570. case 3:
  571. case 4:
  572. mRootPtr = &kernelU4;
  573. break;
  574. }
  575. }
  576. for(int ct=0; ct < 25; ct++) {
  577. mFp[ct] = 1.f / 25.f;
  578. mIp[ct] = (int16_t)(mFp[ct] * 256.f);
  579. }
  580. }
  581. RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
  582. }
  583. void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
  584. s->mHal.info.exportedVariableCount = 2;
  585. }
  586. void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
  587. alloc.clear();
  588. }
  589. RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
  590. const Script *s, const Element *e) {
  591. return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
  592. }
  593. } // namespace renderscript
  594. } // namespace android