loongson_intrinsics.h 96 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949
  1. /*
  2. * Copyright 2022 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
  11. #define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
  12. /*
  13. * Copyright (c) 2022 Loongson Technology Corporation Limited
  14. * All rights reserved.
  15. * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
  16. * Xiwei Gu <guxiwei-hf@loongson.cn>
  17. * Lu Wang <wanglu@loongson.cn>
  18. *
  19. * This file is a header file for loongarch builtin extension.
  20. *
  21. */
  22. #ifndef LOONGSON_INTRINSICS_H
  23. #define LOONGSON_INTRINSICS_H
  24. /**
  * Version of this helper header, MAJOR.MINOR.MICRO (currently 1.1.0).
  * Which component is bumped for which kind of change:
  25. * MAJOR version: Macro usage changes.
  26. * MINOR version: Add new functions, or bug fixes.
  27. * MICRO version: Comment changes or implementation changes.
  28. */
  29. #define LSOM_VERSION_MAJOR 1
  30. #define LSOM_VERSION_MINOR 1
  31. #define LSOM_VERSION_MICRO 0
  /*
   * DUP<n>_ARG<m>: apply the <m>-argument operation _INS to <n> independent
   * groups of inputs, storing the k-th result in _OUTk.
   *
   *   DUP2_ARG2(op, a0, a1, b0, b1, x, y)  ==>  x = op(a0, a1); y = op(b0, b1);
   *
   * _INS may be a function or a function-like macro. Groups are evaluated in
   * argument order, so a later group may read an output written by an earlier
   * one. Each macro expands to a single do { } while (0) statement, so it must
   * be followed by ';' and is safe as the body of an unbraced if/else.
   */
  #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
    do { \
      _OUT0 = _INS(_IN0); \
      _OUT1 = _INS(_IN1); \
    } while (0)
  #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
    do { \
      _OUT0 = _INS(_IN0, _IN1); \
      _OUT1 = _INS(_IN2, _IN3); \
    } while (0)
  #define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
    do { \
      _OUT0 = _INS(_IN0, _IN1, _IN2); \
      _OUT1 = _INS(_IN3, _IN4, _IN5); \
    } while (0)
  /* The DUP4 forms are two back-to-back DUP2 invocations. */
  #define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
    do { \
      DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
      DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
    } while (0)
  #define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                    _OUT1, _OUT2, _OUT3) \
    do { \
      DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
      DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
    } while (0)
  #define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                    _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
    do { \
      DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
      DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
    } while (0)
  64. #ifdef __loongarch_sx
  65. #include <lsxintrin.h>
  66. /*
  67. * =============================================================================
  68. * Description : Dot product & addition of byte vector elements
  69. * Arguments : Inputs - in_c, in_h, in_l
  70. * Outputs - out
  71. * Return Type - halfword
  72. * Details : Signed byte elements from in_h are multiplied by
  73. * signed byte elements from in_l, and then added adjacent to
  74. * each other to get results with the twice size of input.
  75. * Then the results plus to signed half-word elements from in_c.
  76. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
  77. * in_c : 1,2,3,4, 1,2,3,4
  78. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  79. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  80. * out : 23,40,41,26, 23,40,41,26
  81. * =============================================================================
  82. */
  83. static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
  84. __m128i in_h,
  85. __m128i in_l) {
  86. __m128i out;
  87. out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
  88. out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  89. return out;
  90. }
  91. /*
  92. * =============================================================================
  93. * Description : Dot product & addition of byte vector elements
  94. * Arguments : Inputs - in_c, in_h, in_l
  95. * Outputs - out
  96. * Return Type - halfword
  97. * Details : Unsigned byte elements from in_h are multiplied by
  98. * unsigned byte elements from in_l, and then added adjacent to
  99. * each other to get results with the twice size of input.
  100. * The results plus to signed half-word elements from in_c.
  101. * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
  102. * in_c : 1,2,3,4, 1,2,3,4
  103. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  104. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  105. * out : 23,40,41,26, 23,40,41,26
  106. * =============================================================================
  107. */
  108. static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
  109. __m128i in_h,
  110. __m128i in_l) {
  111. __m128i out;
  112. out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
  113. out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  114. return out;
  115. }
  116. /*
  117. * =============================================================================
  118. * Description : Dot product & addition of byte vector elements
  119. * Arguments : Inputs - in_c, in_h, in_l
  120. * Outputs - out
  121. * Return Type - halfword
  122. * Details : Unsigned byte elements from in_h are multiplied by
  123. * signed byte elements from in_l, and then added adjacent to
  124. * each other to get results with the twice size of input.
  125. * The results plus to signed half-word elements from in_c.
  126. * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
  127. * in_c : 1,1,1,1, 1,1,1,1
  128. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  129. * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
  130. * out : -4,-24,-60,-112, 6,26,62,114
  131. * =============================================================================
  132. */
  133. static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
  134. __m128i in_h,
  135. __m128i in_l) {
  136. __m128i out;
  137. out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
  138. out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  139. return out;
  140. }
  141. /*
  142. * =============================================================================
  143. * Description : Dot product & addition of half-word vector elements
  144. * Arguments : Inputs - in_c, in_h, in_l
  145. * Outputs - out
  146. * Return Type - __m128i
  147. * Details : Signed half-word elements from in_h are multiplied by
  148. * signed half-word elements from in_l, and then added adjacent to
  149. * each other to get results with the twice size of input.
  150. * Then the results plus to signed word elements from in_c.
  151. * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
  152. * in_c : 1,2,3,4
  153. * in_h : 1,2,3,4, 5,6,7,8
  154. * in_l : 8,7,6,5, 4,3,2,1
  155. * out : 23,40,41,26
  156. * =============================================================================
  157. */
  158. static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
  159. __m128i in_h,
  160. __m128i in_l) {
  161. __m128i out;
  162. out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
  163. out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  164. return out;
  165. }
  166. /*
  167. * =============================================================================
  168. * Description : Dot product of byte vector elements
  169. * Arguments : Inputs - in_h, in_l
  170. * Outputs - out
  171. * Return Type - halfword
  172. * Details : Signed byte elements from in_h are multiplied by
  173. * signed byte elements from in_l, and then added adjacent to
  174. * each other to get results with the twice size of input.
  175. * Example : out = __lsx_vdp2_h_b(in_h, in_l)
  176. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  177. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  178. * out : 22,38,38,22, 22,38,38,22
  179. * =============================================================================
  180. */
  181. static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  182. __m128i out;
  183. out = __lsx_vmulwev_h_b(in_h, in_l);
  184. out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  185. return out;
  186. }
  187. /*
  188. * =============================================================================
  189. * Description : Dot product of byte vector elements
  190. * Arguments : Inputs - in_h, in_l
  191. * Outputs - out
  192. * Return Type - halfword
  193. * Details : Unsigned byte elements from in_h are multiplied by
  194. * unsigned byte elements from in_l, and then added adjacent to
  195. * each other to get results with the twice size of input.
  196. * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
  197. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  198. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  199. * out : 22,38,38,22, 22,38,38,22
  200. * =============================================================================
  201. */
  202. static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  203. __m128i out;
  204. out = __lsx_vmulwev_h_bu(in_h, in_l);
  205. out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  206. return out;
  207. }
  208. /*
  209. * =============================================================================
  210. * Description : Dot product of byte vector elements
  211. * Arguments : Inputs - in_h, in_l
  212. * Outputs - out
  213. * Return Type - halfword
  214. * Details : Unsigned byte elements from in_h are multiplied by
  215. * signed byte elements from in_l, and then added adjacent to
  216. * each other to get results with the twice size of input.
  217. * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
  218. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  219. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
  220. * out : 22,38,38,22, 22,38,38,6
  221. * =============================================================================
  222. */
  223. static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  224. __m128i out;
  225. out = __lsx_vmulwev_h_bu_b(in_h, in_l);
  226. out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  227. return out;
  228. }
  229. /*
  230. * =============================================================================
  231. * Description : Dot product of byte vector elements
  232. * Arguments : Inputs - in_h, in_l
  233. * Outputs - out
  234. * Return Type - halfword
  235. * Details : Signed byte elements from in_h are multiplied by
  236. * signed byte elements from in_l, and then added adjacent to
  237. * each other to get results with the twice size of input.
  238. * Example : out = __lsx_vdp2_w_h(in_h, in_l)
  239. * in_h : 1,2,3,4, 5,6,7,8
  240. * in_l : 8,7,6,5, 4,3,2,1
  241. * out : 22,38,38,22
  242. * =============================================================================
  243. */
  244. static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
  245. __m128i out;
  246. out = __lsx_vmulwev_w_h(in_h, in_l);
  247. out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  248. return out;
  249. }
  250. /*
  251. * =============================================================================
  252. * Description : Clip all halfword elements of input vector between min & max
  253. * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
  254. * (_in))
  255. * Arguments : Inputs - _in (input vector)
  256. * - min (min threshold)
  257. * - max (max threshold)
  258. * Outputs - out (output vector with clipped elements)
  259. * Return Type - signed halfword
  260. * Example : out = __lsx_vclip_h(_in)
  261. * _in : -8,2,280,249, -8,255,280,249
  262. * min : 1,1,1,1, 1,1,1,1
  263. * max : 9,9,9,9, 9,9,9,9
  264. * out : 1,2,9,9, 1,9,9,9
  265. * =============================================================================
  266. */
  267. static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
  268. __m128i out;
  269. out = __lsx_vmax_h(min, _in);
  270. out = __lsx_vmin_h(max, out);
  271. return out;
  272. }
  273. /*
  274. * =============================================================================
  275. * Description : Set each element of vector between 0 and 255
  276. * Arguments : Inputs - _in
  277. * Outputs - out
  278. * Return Type - halfword
  279. * Details : Signed byte elements from _in are clamped between 0 and 255.
  280. * Example : out = __lsx_vclip255_h(_in)
  281. * _in : -8,255,280,249, -8,255,280,249
  282. * out : 0,255,255,249, 0,255,255,249
  283. * =============================================================================
  284. */
  285. static inline __m128i __lsx_vclip255_h(__m128i _in) {
  286. __m128i out;
  287. out = __lsx_vmaxi_h(_in, 0);
  288. out = __lsx_vsat_hu(out, 7);
  289. return out;
  290. }
  291. /*
  292. * =============================================================================
  293. * Description : Set each element of vector between 0 and 255
  294. * Arguments : Inputs - _in
  295. * Outputs - out
  296. * Return Type - word
  297. * Details : Signed byte elements from _in are clamped between 0 and 255.
  298. * Example : out = __lsx_vclip255_w(_in)
  299. * _in : -8,255,280,249
  300. * out : 0,255,255,249
  301. * =============================================================================
  302. */
  303. static inline __m128i __lsx_vclip255_w(__m128i _in) {
  304. __m128i out;
  305. out = __lsx_vmaxi_w(_in, 0);
  306. out = __lsx_vsat_wu(out, 7);
  307. return out;
  308. }
  /*
   * =============================================================================
   * Description : Swap two __m128i variables
   * Arguments   : Inputs  - _in0, _in1 (must be assignable lvalues)
   *               Outputs - _in0, _in1 (in-place)
   * Details     : The values are exchanged through a temporary. Unlike an
   *               xor-based swap, LSX_SWAP(x, x) leaves x unchanged instead
   *               of clearing it.
   * Example     : LSX_SWAP(_in0, _in1)
   *        _in0 : 1,2,3,4
   *        _in1 : 5,6,7,8
   *   _in0(out) : 5,6,7,8
   *   _in1(out) : 1,2,3,4
   * =============================================================================
   */
  #define LSX_SWAP(_in0, _in1) \
    { \
      __m128i _swap_tmp_m = (_in0); \
      (_in0) = (_in1); \
      (_in1) = _swap_tmp_m; \
    }
  /*
   * =============================================================================
   * Description : Transpose a 4x4 block of word elements
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3   (rows)
   *               Outputs - _out0, _out1, _out2, _out3 (columns)
   * Details     : Words of rows (0,1) and (2,3) are interleaved, then the
   *               doubleword halves of those results are interleaved. All
   *               inputs are read before any output is written.
   * Example     :
   *        1, 2, 3, 4            1, 5, 9,13
   *        5, 6, 7, 8    to      2, 6,10,14
   *        9,10,11,12  =====>    3, 7,11,15
   *       13,14,15,16            4, 8,12,16
   * =============================================================================
   */
  #define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
      __m128i _r01_lo_m, _r01_hi_m, _r23_lo_m, _r23_hi_m; \
      \
      _r01_lo_m = __lsx_vilvl_w(_in1, _in0); \
      _r01_hi_m = __lsx_vilvh_w(_in1, _in0); \
      _r23_lo_m = __lsx_vilvl_w(_in3, _in2); \
      _r23_hi_m = __lsx_vilvh_w(_in3, _in2); \
      _out0 = __lsx_vilvl_d(_r23_lo_m, _r01_lo_m); \
      _out1 = __lsx_vilvh_d(_r23_lo_m, _r01_lo_m); \
      _out2 = __lsx_vilvl_d(_r23_hi_m, _r01_hi_m); \
      _out3 = __lsx_vilvh_d(_r23_hi_m, _r01_hi_m); \
    }
  /*
   * =============================================================================
   * Description : Transpose 8x8 block with byte elements in vectors
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7
   * Details     : The rows of the matrix become columns, and the columns
   *               become rows. Only the low 8 bytes of each input are used.
   *               All inputs are read before any output is written. The
   *               macro's locals carry a leading '_' and trailing '_m' so
   *               they cannot capture a caller variable such as `zero`.
   * Example     : LSX_TRANSPOSE8x8_B
   *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
   *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
   *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
   *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
   *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
   *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
   *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
   *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
   *
   *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
   *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
   *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
   *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
   *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
   *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
   *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
   *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
   * =============================================================================
   */
  #define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      __m128i _zero_m = {0}; \
      __m128i _shuf8_m = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
      __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
      \
      _t0 = __lsx_vilvl_b(_in2, _in0); \
      _t1 = __lsx_vilvl_b(_in3, _in1); \
      _t2 = __lsx_vilvl_b(_in6, _in4); \
      _t3 = __lsx_vilvl_b(_in7, _in5); \
      _t4 = __lsx_vilvl_b(_t1, _t0); \
      _t5 = __lsx_vilvh_b(_t1, _t0); \
      _t6 = __lsx_vilvl_b(_t3, _t2); \
      _t7 = __lsx_vilvh_b(_t3, _t2); \
      _out0 = __lsx_vilvl_w(_t6, _t4); \
      _out2 = __lsx_vilvh_w(_t6, _t4); \
      _out4 = __lsx_vilvl_w(_t7, _t5); \
      _out6 = __lsx_vilvh_w(_t7, _t5); \
      /* Odd outputs are derived from the even output just before them. */ \
      _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m); \
      _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m); \
      _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m); \
      _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m); \
    }
  /*
   * =============================================================================
   * Description : Transpose an 8x8 block of halfword elements
   * Arguments   : Inputs  - _in0 .. _in7   (rows)
   *               Outputs - _out0 .. _out7 (columns)
   * Details     : Rows 4..7 and rows 0..3 are each reduced to four
   *               halfword-interleaved vectors; the doubleword halves of
   *               those are then paired up to form the outputs. All inputs
   *               are read before any output is written.
   * Example     :
   *   00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
   *   10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
   *   20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
   *   30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
   *   40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
   *   50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
   *   60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
   *   70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
   * =============================================================================
   */
  #define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      __m128i _pa_m, _pb_m; \
      __m128i _bot0_m, _bot1_m, _bot2_m, _bot3_m; \
      __m128i _top0_m, _top1_m, _top2_m, _top3_m; \
      \
      /* Rows 4..7, low halves then high halves. */ \
      _pa_m = __lsx_vilvl_h(_in6, _in4); \
      _pb_m = __lsx_vilvl_h(_in7, _in5); \
      _bot0_m = __lsx_vilvl_h(_pb_m, _pa_m); \
      _bot1_m = __lsx_vilvh_h(_pb_m, _pa_m); \
      _pa_m = __lsx_vilvh_h(_in6, _in4); \
      _pb_m = __lsx_vilvh_h(_in7, _in5); \
      _bot2_m = __lsx_vilvl_h(_pb_m, _pa_m); \
      _bot3_m = __lsx_vilvh_h(_pb_m, _pa_m); \
      /* Rows 0..3, low halves then high halves. */ \
      _pa_m = __lsx_vilvl_h(_in2, _in0); \
      _pb_m = __lsx_vilvl_h(_in3, _in1); \
      _top0_m = __lsx_vilvl_h(_pb_m, _pa_m); \
      _top1_m = __lsx_vilvh_h(_pb_m, _pa_m); \
      _pa_m = __lsx_vilvh_h(_in2, _in0); \
      _pb_m = __lsx_vilvh_h(_in3, _in1); \
      _top2_m = __lsx_vilvl_h(_pb_m, _pa_m); \
      _top3_m = __lsx_vilvh_h(_pb_m, _pa_m); \
      \
      _out0 = __lsx_vpickev_d(_bot0_m, _top0_m); \
      _out2 = __lsx_vpickev_d(_bot1_m, _top1_m); \
      _out4 = __lsx_vpickev_d(_bot2_m, _top2_m); \
      _out6 = __lsx_vpickev_d(_bot3_m, _top3_m); \
      _out1 = __lsx_vpickod_d(_bot0_m, _top0_m); \
      _out3 = __lsx_vpickod_d(_bot1_m, _top1_m); \
      _out5 = __lsx_vpickod_d(_bot2_m, _top2_m); \
      _out7 = __lsx_vpickod_d(_bot3_m, _top3_m); \
    }
  /*
   * =============================================================================
   * Description : Transpose input 8x4 byte block into 4x8
   * Arguments   : Inputs  - _in0 .. _in7 (eight rows; the example uses the
   *                         first 4 bytes of each)
   *               Outputs - _out0, _out1, _out2, _out3 (four rows of 8 bytes)
   * Details     : The rows of the matrix become columns, and the columns
   *               become rows. All inputs are read before any output is
   *               written.
   * Example     : LSX_TRANSPOSE8x4_B
   *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
   *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
   *
   *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
   *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
   *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
   *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
   * =============================================================================
   */
  #define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3) \
    { \
      __m128i _pk_a_m, _pk_b_m, _bytes01_m, _bytes23_m, _half_lo_m, _half_hi_m; \
      \
      /* Pack row k with row k+4, then interleave bytes of rows (0,1). */ \
      _pk_a_m = __lsx_vpackev_w(_in4, _in0); \
      _pk_b_m = __lsx_vpackev_w(_in5, _in1); \
      _bytes01_m = __lsx_vilvl_b(_pk_b_m, _pk_a_m); \
      /* Same for rows (2,3). */ \
      _pk_a_m = __lsx_vpackev_w(_in6, _in2); \
      _pk_b_m = __lsx_vpackev_w(_in7, _in3); \
      _bytes23_m = __lsx_vilvl_b(_pk_b_m, _pk_a_m); \
      \
      _half_lo_m = __lsx_vilvl_h(_bytes23_m, _bytes01_m); \
      _half_hi_m = __lsx_vilvh_h(_bytes23_m, _bytes01_m); \
      \
      _out0 = __lsx_vilvl_w(_half_hi_m, _half_lo_m); \
      _out2 = __lsx_vilvh_w(_half_hi_m, _half_lo_m); \
      _out1 = __lsx_vilvh_d(_out2, _out0); \
      _out3 = __lsx_vilvh_d(_out0, _out2); \
    }
  /*
   * =============================================================================
   * Description : Transpose a 16x8 block of byte elements
   * Arguments   : Inputs  - _in0 .. _in15  (16 rows, low 8 bytes of each used)
   *               Outputs - _out0 .. _out7 (8 rows of 16 bytes)
   * Details     : Three interleave stages (byte, word, doubleword). All
   *               inputs are read before any output is written.
   * Example     :
   *   000,001,002,003,004,005,006,007
   *   008,009,010,011,012,013,014,015
   *   016,017,018,019,020,021,022,023
   *   024,025,026,027,028,029,030,031
   *   032,033,034,035,036,037,038,039
   *   040,041,042,043,044,045,046,047          000,008,...,112,120
   *   048,049,050,051,052,053,054,055          001,009,...,113,121
   *   056,057,058,059,060,061,062,063    to    002,010,...,114,122
   *   064,065,066,067,068,069,070,071  =====>  003,011,...,115,123
   *   072,073,074,075,076,077,078,079          004,012,...,116,124
   *   080,081,082,083,084,085,086,087          005,013,...,117,125
   *   088,089,090,091,092,093,094,095          006,014,...,118,126
   *   096,097,098,099,100,101,102,103          007,015,...,119,127
   *   104,105,106,107,108,109,110,111
   *   112,113,114,115,116,117,118,119
   *   120,121,122,123,124,125,126,127
   * =============================================================================
   */
  #define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                              _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                              _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                              _out6, _out7) \
    { \
      __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
      __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
      \
      /* Stage 1a: interleave bytes of rows two apart. */ \
      _tmp0 = __lsx_vilvl_b(_in2, _in0); \
      _tmp1 = __lsx_vilvl_b(_in3, _in1); \
      _tmp2 = __lsx_vilvl_b(_in6, _in4); \
      _tmp3 = __lsx_vilvl_b(_in7, _in5); \
      _tmp4 = __lsx_vilvl_b(_in10, _in8); \
      _tmp5 = __lsx_vilvl_b(_in11, _in9); \
      _tmp6 = __lsx_vilvl_b(_in14, _in12); \
      _tmp7 = __lsx_vilvl_b(_in15, _in13); \
      /* Stage 1b: interleave bytes again within each group of four rows. */ \
      _t0 = __lsx_vilvl_b(_tmp1, _tmp0); \
      _t2 = __lsx_vilvl_b(_tmp3, _tmp2); \
      _t1 = __lsx_vilvh_b(_tmp1, _tmp0); \
      _t3 = __lsx_vilvh_b(_tmp3, _tmp2); \
      _t4 = __lsx_vilvl_b(_tmp5, _tmp4); \
      _t6 = __lsx_vilvl_b(_tmp7, _tmp6); \
      _t5 = __lsx_vilvh_b(_tmp5, _tmp4); \
      _t7 = __lsx_vilvh_b(_tmp7, _tmp6); \
      /* Stage 2: interleave words (the _tmp registers are reused). */ \
      _tmp0 = __lsx_vilvl_w(_t2, _t0); \
      _tmp4 = __lsx_vilvl_w(_t3, _t1); \
      _tmp2 = __lsx_vilvh_w(_t2, _t0); \
      _tmp6 = __lsx_vilvh_w(_t3, _t1); \
      _tmp1 = __lsx_vilvl_w(_t6, _t4); \
      _tmp5 = __lsx_vilvl_w(_t7, _t5); \
      _tmp3 = __lsx_vilvh_w(_t6, _t4); \
      _tmp7 = __lsx_vilvh_w(_t7, _t5); \
      /* Stage 3: interleave doublewords into the outputs. */ \
      _out0 = __lsx_vilvl_d(_tmp1, _tmp0); \
      _out2 = __lsx_vilvl_d(_tmp3, _tmp2); \
      _out1 = __lsx_vilvh_d(_tmp1, _tmp0); \
      _out3 = __lsx_vilvh_d(_tmp3, _tmp2); \
      _out4 = __lsx_vilvl_d(_tmp5, _tmp4); \
      _out6 = __lsx_vilvl_d(_tmp7, _tmp6); \
      _out5 = __lsx_vilvh_d(_tmp5, _tmp4); \
      _out7 = __lsx_vilvh_d(_tmp7, _tmp6); \
    }
  550. /*
  551. * =============================================================================
  552. * Description : Butterfly of 4 input vectors
  553. * Arguments : Inputs - in0, in1, in2, in3
  554. * Outputs - out0, out1, out2, out3
  555. * Details : Butterfly operation
  556. * Example :
  557. * out0 = in0 + in3;
  558. * out1 = in1 + in2;
  559. * out2 = in1 - in2;
  560. * out3 = in0 - in3;
  561. * =============================================================================
  562. */
  /*
   * 4-input butterfly on byte elements: two sums followed by two differences.
   * The outputs are assigned in the order _out0.._out3 while the inputs are
   * still being read, so an output that names an input used by a later
   * statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                           \
      (_out0) = __lsx_vadd_b((_in0), (_in3));                                   \
      (_out1) = __lsx_vadd_b((_in1), (_in2));                                   \
      (_out2) = __lsx_vsub_b((_in1), (_in2));                                   \
      (_out3) = __lsx_vsub_b((_in0), (_in3));                                   \
    }
  /*
   * 4-input butterfly on halfword elements: two sums followed by two
   * differences. The outputs are assigned in the order _out0.._out3 while the
   * inputs are still being read, so an output that names an input used by a
   * later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                           \
      (_out0) = __lsx_vadd_h((_in0), (_in3));                                   \
      (_out1) = __lsx_vadd_h((_in1), (_in2));                                   \
      (_out2) = __lsx_vsub_h((_in1), (_in2));                                   \
      (_out3) = __lsx_vsub_h((_in0), (_in3));                                   \
    }
  /*
   * 4-input butterfly on word elements: two sums followed by two differences.
   * The outputs are assigned in the order _out0.._out3 while the inputs are
   * still being read, so an output that names an input used by a later
   * statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                           \
      (_out0) = __lsx_vadd_w((_in0), (_in3));                                   \
      (_out1) = __lsx_vadd_w((_in1), (_in2));                                   \
      (_out2) = __lsx_vsub_w((_in1), (_in2));                                   \
      (_out3) = __lsx_vsub_w((_in0), (_in3));                                   \
    }
  /*
   * 4-input butterfly on double-word elements: two sums followed by two
   * differences. The outputs are assigned in the order _out0.._out3 while the
   * inputs are still being read, so an output that names an input used by a
   * later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                           \
      (_out0) = __lsx_vadd_d((_in0), (_in3));                                   \
      (_out1) = __lsx_vadd_d((_in1), (_in2));                                   \
      (_out2) = __lsx_vsub_d((_in1), (_in2));                                   \
      (_out3) = __lsx_vsub_d((_in0), (_in3));                                   \
    }
  591. /*
  592. * =============================================================================
  593. * Description : Butterfly of 8 input vectors
  594. * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
  595. * Outputs - _out0, _out1, _out2, _out3, ~
  596. * Details : Butterfly operation
  597. * Example :
  598. * _out0 = _in0 + _in7;
  599. * _out1 = _in1 + _in6;
  600. * _out2 = _in2 + _in5;
  601. * _out3 = _in3 + _in4;
  602. * _out4 = _in3 - _in4;
  603. * _out5 = _in2 - _in5;
  604. * _out6 = _in1 - _in6;
  605. * _out7 = _in0 - _in7;
  606. * =============================================================================
  607. */
  /*
   * 8-input butterfly on byte elements: four sums (_out0.._out3) followed by
   * four differences (_out4.._out7). The outputs are assigned in that order
   * while the inputs are still being read, so an output that names an input
   * used by a later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                          \
    {                                                                       \
      (_out0) = __lsx_vadd_b((_in0), (_in7));                               \
      (_out1) = __lsx_vadd_b((_in1), (_in6));                               \
      (_out2) = __lsx_vadd_b((_in2), (_in5));                               \
      (_out3) = __lsx_vadd_b((_in3), (_in4));                               \
      (_out4) = __lsx_vsub_b((_in3), (_in4));                               \
      (_out5) = __lsx_vsub_b((_in2), (_in5));                               \
      (_out6) = __lsx_vsub_b((_in1), (_in6));                               \
      (_out7) = __lsx_vsub_b((_in0), (_in7));                               \
    }
  /*
   * 8-input butterfly on halfword elements: four sums (_out0.._out3) followed
   * by four differences (_out4.._out7). The outputs are assigned in that order
   * while the inputs are still being read, so an output that names an input
   * used by a later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                          \
    {                                                                       \
      (_out0) = __lsx_vadd_h((_in0), (_in7));                               \
      (_out1) = __lsx_vadd_h((_in1), (_in6));                               \
      (_out2) = __lsx_vadd_h((_in2), (_in5));                               \
      (_out3) = __lsx_vadd_h((_in3), (_in4));                               \
      (_out4) = __lsx_vsub_h((_in3), (_in4));                               \
      (_out5) = __lsx_vsub_h((_in2), (_in5));                               \
      (_out6) = __lsx_vsub_h((_in1), (_in6));                               \
      (_out7) = __lsx_vsub_h((_in0), (_in7));                               \
    }
  /*
   * 8-input butterfly on word elements: four sums (_out0.._out3) followed by
   * four differences (_out4.._out7). The outputs are assigned in that order
   * while the inputs are still being read, so an output that names an input
   * used by a later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                          \
    {                                                                       \
      (_out0) = __lsx_vadd_w((_in0), (_in7));                               \
      (_out1) = __lsx_vadd_w((_in1), (_in6));                               \
      (_out2) = __lsx_vadd_w((_in2), (_in5));                               \
      (_out3) = __lsx_vadd_w((_in3), (_in4));                               \
      (_out4) = __lsx_vsub_w((_in3), (_in4));                               \
      (_out5) = __lsx_vsub_w((_in2), (_in5));                               \
      (_out6) = __lsx_vsub_w((_in1), (_in6));                               \
      (_out7) = __lsx_vsub_w((_in0), (_in7));                               \
    }
  /*
   * 8-input butterfly on double-word elements: four sums (_out0.._out3)
   * followed by four differences (_out4.._out7). The outputs are assigned in
   * that order while the inputs are still being read, so an output that names
   * an input used by a later statement changes that statement's operand.
   */
  #define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                          \
    {                                                                       \
      (_out0) = __lsx_vadd_d((_in0), (_in7));                               \
      (_out1) = __lsx_vadd_d((_in1), (_in6));                               \
      (_out2) = __lsx_vadd_d((_in2), (_in5));                               \
      (_out3) = __lsx_vadd_d((_in3), (_in4));                               \
      (_out4) = __lsx_vsub_d((_in3), (_in4));                               \
      (_out5) = __lsx_vsub_d((_in2), (_in5));                               \
      (_out6) = __lsx_vsub_d((_in1), (_in6));                               \
      (_out7) = __lsx_vsub_d((_in0), (_in7));                               \
    }
  660. #endif // LSX
  661. #ifdef __loongarch_asx
  662. #include <lasxintrin.h>
  663. /*
  664. * =============================================================================
  665. * Description : Dot product of byte vector elements
  666. * Arguments : Inputs - in_h, in_l
  667. * Output - out
  668. * Return Type - signed halfword
  669. * Details : Unsigned byte elements from in_h are multiplied with
  670. * unsigned byte elements from in_l producing a result
  671. * twice the size of input i.e. signed halfword.
  672. * Then this multiplied results of adjacent odd-even elements
  673. * are added to the out vector
  674. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
  675. * =============================================================================
  676. */
  677. static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
  678. __m256i out;
  679. out = __lasx_xvmulwev_h_bu(in_h, in_l);
  680. out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  681. return out;
  682. }
  683. /*
  684. * =============================================================================
  685. * Description : Dot product of byte vector elements
  686. * Arguments : Inputs - in_h, in_l
  687. * Output - out
  688. * Return Type - signed halfword
  689. * Details : Signed byte elements from in_h are multiplied with
  690. * signed byte elements from in_l producing a result
  691. * twice the size of input i.e. signed halfword.
  692. * Then this multiplication results of adjacent odd-even elements
  693. * are added to the out vector
  694. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
  695. * =============================================================================
  696. */
  697. static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
  698. __m256i out;
  699. out = __lasx_xvmulwev_h_b(in_h, in_l);
  700. out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  701. return out;
  702. }
  703. /*
  704. * =============================================================================
  705. * Description : Dot product of halfword vector elements
  706. * Arguments : Inputs - in_h, in_l
  707. * Output - out
  708. * Return Type - signed word
  709. * Details : Signed halfword elements from in_h are multiplied with
  710. * signed halfword elements from in_l producing a result
  711. * twice the size of input i.e. signed word.
  712. * Then this multiplied results of adjacent odd-even elements
  713. * are added to the out vector.
  714. * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
  715. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  716. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  717. * out : 22,38,38,22, 22,38,38,22
  718. * =============================================================================
  719. */
  720. static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
  721. __m256i out;
  722. out = __lasx_xvmulwev_w_h(in_h, in_l);
  723. out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  724. return out;
  725. }
  726. /*
  727. * =============================================================================
  728. * Description : Dot product of word vector elements
  729. * Arguments : Inputs - in_h, in_l
  730. * Output - out
  731. * Return Type - signed double
  732. * Details : Signed word elements from in_h are multiplied with
  733. * signed word elements from in_l producing a result
  734. * twice the size of input i.e. signed double-word.
  735. * Then this multiplied results of adjacent odd-even elements
  736. * are added to the out vector.
  737. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
  738. * =============================================================================
  739. */
  740. static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
  741. __m256i out;
  742. out = __lasx_xvmulwev_d_w(in_h, in_l);
  743. out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
  744. return out;
  745. }
  746. /*
  747. * =============================================================================
  748. * Description : Dot product of halfword vector elements
  749. * Arguments : Inputs - in_h, in_l
  750. * Output - out
  751. * Return Type - signed word
  752. * Details : Unsigned halfword elements from in_h are multiplied with
  753. * signed halfword elements from in_l producing a result
  754. * twice the size of input i.e. unsigned word.
  755. * Multiplication result of adjacent odd-even elements
  756. * are added to the out vector
  757. * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
  758. * =============================================================================
  759. */
  760. static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  761. __m256i out;
  762. out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
  763. out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  764. return out;
  765. }
  766. /*
  767. * =============================================================================
  768. * Description : Dot product & addition of byte vector elements
  769. * Arguments : Inputs - in_h, in_l
  770. * Output - out
  771. * Return Type - halfword
  772. * Details : Signed byte elements from in_h are multiplied with
  773. * signed byte elements from in_l producing a result
  774. * twice the size of input i.e. signed halfword.
  775. * Then this multiplied results of adjacent odd-even elements
  776. * are added to the in_c vector.
  777. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  778. * =============================================================================
  779. */
  780. static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
  781. __m256i in_h,
  782. __m256i in_l) {
  783. __m256i out;
  784. out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
  785. out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  786. return out;
  787. }
  788. /*
  789. * =============================================================================
  790. * Description : Dot product & addition of byte vector elements
  791. * Arguments : Inputs - in_h, in_l
  792. * Output - out
  793. * Return Type - halfword
  794. * Details : Unsigned byte elements from in_h are multiplied with
  795. * unsigned byte elements from in_l producing a result
  796. * twice the size of input i.e. signed halfword.
  797. * Then this multiplied results of adjacent odd-even elements
  798. * are added to the in_c vector.
  799. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  800. * =============================================================================
  801. */
  802. static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
  803. __m256i in_h,
  804. __m256i in_l) {
  805. __m256i out;
  806. out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
  807. out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  808. return out;
  809. }
  810. /*
  811. * =============================================================================
  812. * Description : Dot product & addition of byte vector elements
  813. * Arguments : Inputs - in_h, in_l
  814. * Output - out
  815. * Return Type - halfword
  816. * Details : Unsigned byte elements from in_h are multiplied with
  817. * signed byte elements from in_l producing a result
  818. * twice the size of input i.e. signed halfword.
  819. * Then this multiplied results of adjacent odd-even elements
  820. * are added to the in_c vector.
  821. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  822. * =============================================================================
  823. */
  824. static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
  825. __m256i in_h,
  826. __m256i in_l) {
  827. __m256i out;
  828. out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
  829. out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
  830. return out;
  831. }
  832. /*
  833. * =============================================================================
  834. * Description : Dot product of halfword vector elements
  835. * Arguments : Inputs - in_c, in_h, in_l
  836. * Output - out
  837. * Return Type - per RTYPE
  838. * Details : Signed halfword elements from in_h are multiplied with
  839. * signed halfword elements from in_l producing a result
  840. * twice the size of input i.e. signed word.
  841. * Multiplication result of adjacent odd-even elements
  842. * are added to the in_c vector.
  843. * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  844. * in_c : 1,2,3,4, 1,2,3,4
  845. * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
  846. * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
  847. * out : 23,40,41,26, 23,40,41,26
  848. * =============================================================================
  849. */
  850. static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
  851. __m256i in_h,
  852. __m256i in_l) {
  853. __m256i out;
  854. out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
  855. out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  856. return out;
  857. }
  858. /*
  859. * =============================================================================
  860. * Description : Dot product of halfword vector elements
  861. * Arguments : Inputs - in_c, in_h, in_l
  862. * Output - out
  863. * Return Type - signed word
  864. * Details : Unsigned halfword elements from in_h are multiplied with
  865. * unsigned halfword elements from in_l producing a result
  866. * twice the size of input i.e. signed word.
  867. * Multiplication result of adjacent odd-even elements
  868. * are added to the in_c vector.
  869. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  870. * =============================================================================
  871. */
  872. static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
  873. __m256i in_h,
  874. __m256i in_l) {
  875. __m256i out;
  876. out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
  877. out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
  878. return out;
  879. }
  880. /*
  881. * =============================================================================
  882. * Description : Dot product of halfword vector elements
  883. * Arguments : Inputs - in_c, in_h, in_l
  884. * Output - out
  885. * Return Type - signed word
  886. * Details : Unsigned halfword elements from in_h are multiplied with
  887. * signed halfword elements from in_l producing a result
  888. * twice the size of input i.e. signed word.
  889. * Multiplication result of adjacent odd-even elements
  890. * are added to the in_c vector
  891. * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  892. * =============================================================================
  893. */
  894. static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
  895. __m256i in_h,
  896. __m256i in_l) {
  897. __m256i out;
  898. out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
  899. out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  900. return out;
  901. }
  902. /*
  903. * =============================================================================
  904. * Description : Vector Unsigned Dot Product and Subtract
  905. * Arguments : Inputs - in_c, in_h, in_l
  906. * Output - out
  907. * Return Type - signed halfword
  908. * Details : Unsigned byte elements from in_h are multiplied with
  909. * unsigned byte elements from in_l producing a result
  910. * twice the size of input i.e. signed halfword.
  911. * Multiplication result of adjacent odd-even elements
  912. * are added together and subtracted from double width elements
  913. * in_c vector.
  914. * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
  915. * =============================================================================
  916. */
  917. static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
  918. __m256i in_h,
  919. __m256i in_l) {
  920. __m256i out;
  921. out = __lasx_xvmulwev_h_bu(in_h, in_l);
  922. out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  923. out = __lasx_xvsub_h(in_c, out);
  924. return out;
  925. }
  926. /*
  927. * =============================================================================
  928. * Description : Vector Signed Dot Product and Subtract
  929. * Arguments : Inputs - in_c, in_h, in_l
  930. * Output - out
  931. * Return Type - signed word
  932. * Details : Signed halfword elements from in_h are multiplied with
  933. * Signed halfword elements from in_l producing a result
  934. * twice the size of input i.e. signed word.
  935. * Multiplication result of adjacent odd-even elements
  936. * are added together and subtracted from double width elements
  937. * in_c vector.
  938. * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
  939. * in_c : 0,0,0,0, 0,0,0,0
  940. * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
  941. * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
  942. * out : -7,-3,0,0, 0,-1,0,-1
  943. * =============================================================================
  944. */
  945. static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
  946. __m256i in_h,
  947. __m256i in_l) {
  948. __m256i out;
  949. out = __lasx_xvmulwev_w_h(in_h, in_l);
  950. out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  951. out = __lasx_xvsub_w(in_c, out);
  952. return out;
  953. }
  954. /*
  955. * =============================================================================
  956. * Description : Dot product of halfword vector elements
  957. * Arguments : Inputs - in_h, in_l
  958. * Output - out
  959. * Return Type - signed word
  960. * Details : Signed halfword elements from in_h are multiplied with
  961. * signed halfword elements from in_l producing a result
  962. * four times the size of input i.e. signed doubleword.
  963. * Then this multiplication results of four adjacent elements
  964. * are added together and stored to the out vector.
  965. * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
  966. * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
  967. * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
  968. * out : -2,0,1,1
  969. * =============================================================================
  970. */
  971. static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
  972. __m256i out;
  973. out = __lasx_xvmulwev_w_h(in_h, in_l);
  974. out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  975. out = __lasx_xvhaddw_d_w(out, out);
  976. return out;
  977. }
  978. /*
  979. * =============================================================================
  980. * Description : The high half of the vector elements are expanded and
  981. * added after being doubled.
  982. * Arguments : Inputs - in_h, in_l
  983. * Output - out
  984. * Details : The in_h vector and the in_l vector are added after the
  985. * higher half of the two-fold sign extension (signed byte
  986. * to signed halfword) and stored to the out vector.
  987. * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
  988. * =============================================================================
  989. */
  990. static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
  991. __m256i out;
  992. out = __lasx_xvilvh_b(in_h, in_l);
  993. out = __lasx_xvhaddw_h_b(out, out);
  994. return out;
  995. }
  996. /*
  997. * =============================================================================
  998. * Description : The high half of the vector elements are expanded and
  999. * added after being doubled.
  1000. * Arguments : Inputs - in_h, in_l
  1001. * Output - out
  1002. * Details : The in_h vector and the in_l vector are added after the
  1003. * higher half of the two-fold sign extension (signed halfword
  1004. * to signed word) and stored to the out vector.
  1005. * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
  1006. * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
  1007. * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
  1008. * out : 1,0,0,-1, 1,0,0, 2
  1009. * =============================================================================
  1010. */
  1011. static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
  1012. __m256i out;
  1013. out = __lasx_xvilvh_h(in_h, in_l);
  1014. out = __lasx_xvhaddw_w_h(out, out);
  1015. return out;
  1016. }
  1017. /*
  1018. * =============================================================================
  1019. * Description : The low half of the vector elements are expanded and
  1020. * added after being doubled.
  1021. * Arguments : Inputs - in_h, in_l
  1022. * Output - out
  1023. * Details : The in_h vector and the in_l vector are added after the
  1024. * lower half of the two-fold sign extension (signed byte
  1025. * to signed halfword) and stored to the out vector.
  1026. * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
  1027. * =============================================================================
  1028. */
  1029. static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
  1030. __m256i out;
  1031. out = __lasx_xvilvl_b(in_h, in_l);
  1032. out = __lasx_xvhaddw_h_b(out, out);
  1033. return out;
  1034. }
  1035. /*
  1036. * =============================================================================
  1037. * Description : The low half of the vector elements are expanded and
  1038. * added after being doubled.
  1039. * Arguments : Inputs - in_h, in_l
  1040. * Output - out
  1041. * Details : The in_h vector and the in_l vector are added after the
  1042. * lower half of the two-fold sign extension (signed halfword
  1043. * to signed word) and stored to the out vector.
  1044. * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
  1045. * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
  1046. * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
  1047. * out : 5,-1,4,2, 1,0,2,-1
  1048. * =============================================================================
  1049. */
  1050. static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
  1051. __m256i out;
  1052. out = __lasx_xvilvl_h(in_h, in_l);
  1053. out = __lasx_xvhaddw_w_h(out, out);
  1054. return out;
  1055. }
  1056. /*
  1057. * =============================================================================
  1058. * Description : The low half of the vector elements are expanded and
  1059. * added after being doubled.
  1060. * Arguments : Inputs - in_h, in_l
  1061. * Output - out
  1062. * Details : The out vector and the out vector are added after the
  1063. * lower half of the two-fold zero extension (unsigned byte
  1064. * to unsigned halfword) and stored to the out vector.
  1065. * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
  1066. * =============================================================================
  1067. */
  1068. static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
  1069. __m256i out;
  1070. out = __lasx_xvilvl_b(in_h, in_l);
  1071. out = __lasx_xvhaddw_hu_bu(out, out);
  1072. return out;
  1073. }
  1074. /*
  1075. * =============================================================================
  1076. * Description : The low half of the vector elements are expanded and
  1077. * added after being doubled.
  1078. * Arguments : Inputs - in_h, in_l
  1079. * Output - out
  1080. * Details : The in_l vector after double zero extension (unsigned byte to
  1081. * signed halfword),added to the in_h vector.
  1082. * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
  1083. * =============================================================================
  1084. */
  1085. static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
  1086. __m256i out;
  1087. out = __lasx_xvsllwil_hu_bu(in_l, 0);
  1088. out = __lasx_xvadd_h(in_h, out);
  1089. return out;
  1090. }
  1091. /*
  1092. * =============================================================================
  1093. * Description : The low half of the vector elements are expanded and
  1094. * added after being doubled.
  1095. * Arguments : Inputs - in_h, in_l
  1096. * Output - out
  1097. * Details : The in_l vector after double sign extension (signed halfword to
  1098. * signed word), added to the in_h vector.
  1099. * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
  1100. * in_h : 0, 1,0,0, -1,0,0,1,
  1101. * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
  1102. * out : 2, 0,1,2, -1,0,1,1,
  1103. * =============================================================================
  1104. */
  1105. static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
  1106. __m256i out;
  1107. out = __lasx_xvsllwil_w_h(in_l, 0);
  1108. out = __lasx_xvadd_w(in_h, out);
  1109. return out;
  1110. }
  1111. /*
  1112. * =============================================================================
  1113. * Description : Multiplication and addition calculation after expansion
  1114. * of the lower half of the vector.
  1115. * Arguments : Inputs - in_c, in_h, in_l
  1116. * Output - out
  1117. * Details : The in_h vector and the in_l vector are multiplied after
  1118. * the lower half of the two-fold sign extension (signed halfword
  1119. * to signed word), and the result is added to the vector in_c,
  1120. * then stored to the out vector.
  1121. * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
  1122. * in_c : 1,2,3,4, 5,6,7,8
  1123. * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
  1124. * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
  1125. * -200,-300,-400,-500, -2000,-3000,-4000,-5000
  1126. * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
  1127. * =============================================================================
  1128. */
  1129. static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
  1130. __m256i in_h,
  1131. __m256i in_l) {
  1132. __m256i tmp0, tmp1, out;
  1133. tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  1134. tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  1135. tmp0 = __lasx_xvmul_w(tmp0, tmp1);
  1136. out = __lasx_xvadd_w(tmp0, in_c);
  1137. return out;
  1138. }
  1139. /*
  1140. * =============================================================================
  1141. * Description : Multiplication and addition calculation after expansion
  1142. * of the higher half of the vector.
  1143. * Arguments : Inputs - in_c, in_h, in_l
  1144. * Output - out
  1145. * Details : The in_h vector and the in_l vector are multiplied after
  1146. * the higher half of the two-fold sign extension (signed
  1147. * halfword to signed word), and the result is added to
  1148. * the vector in_c, then stored to the out vector.
  1149. * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
  1150. * =============================================================================
  1151. */
  1152. static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
  1153. __m256i in_h,
  1154. __m256i in_l) {
  1155. __m256i tmp0, tmp1, out;
  1156. tmp0 = __lasx_xvilvh_h(in_h, in_h);
  1157. tmp1 = __lasx_xvilvh_h(in_l, in_l);
  1158. tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
  1159. out = __lasx_xvadd_w(tmp0, in_c);
  1160. return out;
  1161. }
  1162. /*
  1163. * =============================================================================
  1164. * Description : Multiplication calculation after expansion of the lower
  1165. * half of the vector.
  1166. * Arguments : Inputs - in_h, in_l
  1167. * Output - out
  1168. * Details : The in_h vector and the in_l vector are multiplied after
  1169. * the lower half of the two-fold sign extension (signed
  1170. * halfword to signed word), then stored to the out vector.
  1171. * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
  1172. * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
  1173. * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
  1174. * out : 6,1,3,0, 0,0,1,0
  1175. * =============================================================================
  1176. */
  1177. static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  1178. __m256i tmp0, tmp1, out;
  1179. tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  1180. tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  1181. out = __lasx_xvmul_w(tmp0, tmp1);
  1182. return out;
  1183. }
  1184. /*
  1185. * =============================================================================
  1186. * Description : Multiplication calculation after expansion of the lower
  1187. * half of the vector.
  1188. * Arguments : Inputs - in_h, in_l
  1189. * Output - out
  1190. * Details : The in_h vector and the in_l vector are multiplied after
  1191. * the lower half of the two-fold sign extension (signed
  1192. * halfword to signed word), then stored to the out vector.
  1193. * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
  1194. * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
  1195. * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
  1196. * out : 0,0,0,0, 0,0,0,1
  1197. * =============================================================================
  1198. */
  1199. static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  1200. __m256i tmp0, tmp1, out;
  1201. tmp0 = __lasx_xvilvh_h(in_h, in_h);
  1202. tmp1 = __lasx_xvilvh_h(in_l, in_l);
  1203. out = __lasx_xvmulwev_w_h(tmp0, tmp1);
  1204. return out;
  1205. }
  1206. /*
  1207. * =============================================================================
  1208. * Description : The low half of the vector elements are added to the high half
  1209. * after being doubled, then saturated.
  1210. * Arguments : Inputs - in_h, in_l
  1211. * Output - out
  1212. * Details : The in_h vector adds the in_l vector after the lower half of
  1213. * the two-fold zero extension (unsigned byte to unsigned
  1214. * halfword) and then saturated. The results are stored to the out
  1215. * vector.
  1216. * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
  1217. * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
  1218. * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
  1219. * 0,0,0,1
  1220. * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
  1221. * =============================================================================
  1222. */
  1223. static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  1224. __m256i tmp1, out;
  1225. __m256i zero = {0};
  1226. tmp1 = __lasx_xvilvl_b(zero, in_l);
  1227. out = __lasx_xvsadd_hu(in_h, tmp1);
  1228. return out;
  1229. }
  1230. /*
  1231. * =============================================================================
  1232. * Description : Clip all halfword elements of input vector between min & max
  1233. * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
  1234. * Arguments : Inputs - in (input vector)
  1235. * - min (min threshold)
  1236. * - max (max threshold)
  1237. * Outputs - in (output vector with clipped elements)
  1238. * Return Type - signed halfword
  1239. * Example : out = __lasx_xvclip_h(in, min, max)
  1240. * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
  1241. * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
  1242. * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
  1243. * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
  1244. * =============================================================================
  1245. */
  1246. static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  1247. __m256i out;
  1248. out = __lasx_xvmax_h(min, in);
  1249. out = __lasx_xvmin_h(max, out);
  1250. return out;
  1251. }
  1252. /*
  1253. * =============================================================================
  1254. * Description : Clip all signed halfword elements of input vector
  1255. * between 0 & 255
  1256. * Arguments : Inputs - in (input vector)
  1257. * Outputs - out (output vector with clipped elements)
  1258. * Return Type - signed halfword
  1259. * Example : See out = __lasx_xvclip255_w(in)
  1260. * =============================================================================
  1261. */
  1262. static inline __m256i __lasx_xvclip255_h(__m256i in) {
  1263. __m256i out;
  1264. out = __lasx_xvmaxi_h(in, 0);
  1265. out = __lasx_xvsat_hu(out, 7);
  1266. return out;
  1267. }
  1268. /*
  1269. * =============================================================================
  1270. * Description : Clip all signed word elements of input vector
  1271. * between 0 & 255
  1272. * Arguments : Inputs - in (input vector)
  1273. * Output - out (output vector with clipped elements)
  1274. * Return Type - signed word
  1275. * Example : out = __lasx_xvclip255_w(in)
  1276. * in : -8,255,280,249, -8,255,280,249
  1277. * out : 0,255,255,249, 0,255,255,249
  1278. * =============================================================================
  1279. */
  1280. static inline __m256i __lasx_xvclip255_w(__m256i in) {
  1281. __m256i out;
  1282. out = __lasx_xvmaxi_w(in, 0);
  1283. out = __lasx_xvsat_wu(out, 7);
  1284. return out;
  1285. }
  1286. /*
  1287. * =============================================================================
  1288. * Description : Indexed halfword element values are replicated to all
  1289. * elements in output vector. If 'idx < 8' use xvsplati_l_*,
  1290. * if 'idx >= 8' use xvsplati_h_*.
  1291. * Arguments : Inputs - in, idx
  1292. * Output - out
  1293. * Details : Idx element value from in vector is replicated to all
  1294. * elements in out vector.
  1295. * Valid index range for halfword operation is 0-7
  1296. * Example : out = __lasx_xvsplati_l_h(in, idx)
  1297. * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
  1298. * idx : 0x02
  1299. * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
  1300. * =============================================================================
  1301. */
  1302. static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  1303. __m256i out;
  1304. out = __lasx_xvpermi_q(in, in, 0x02);
  1305. out = __lasx_xvreplve_h(out, idx);
  1306. return out;
  1307. }
  1308. /*
  1309. * =============================================================================
  1310. * Description : Indexed halfword element values are replicated to all
  1311. * elements in output vector. If 'idx < 8' use xvsplati_l_*,
  1312. * if 'idx >= 8' use xvsplati_h_*.
  1313. * Arguments : Inputs - in, idx
  1314. * Output - out
  1315. * Details : Idx element value from in vector is replicated to all
  1316. * elements in out vector.
  1317. * Valid index range for halfword operation is 0-7
  1318. * Example : out = __lasx_xvsplati_h_h(in, idx)
  1319. * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
  1320. * idx : 0x09
  1321. * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
  1322. * =============================================================================
  1323. */
  1324. static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  1325. __m256i out;
  1326. out = __lasx_xvpermi_q(in, in, 0x13);
  1327. out = __lasx_xvreplve_h(out, idx);
  1328. return out;
  1329. }
  /*
   * =============================================================================
   * Description : Transpose a 4x4 block of double-word elements held in four
   *               vectors.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3
   *               Outputs - _out0, _out1, _out2, _out3
   * Details     : Row pairs (0,1) and (2,3) are interleaved at double-word
   *               granularity, then xvpermi_q combines the pair results into
   *               the output vectors. All inputs are read before any output is
   *               written.
   * Example     : LASX_TRANSPOSE4x4_D
   *        _in0  : 1,2,3,4
   *        _in1  : 1,2,3,4
   *        _in2  : 1,2,3,4
   *        _in3  : 1,2,3,4
   *
   *        _out0 : 1,1,1,1
   *        _out1 : 2,2,2,2
   *        _out2 : 3,3,3,3
   *        _out3 : 4,4,4,4
   * =============================================================================
   */
  #define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                              _out3) \
    { \
      /* Interleave double-words of rows 0/1 and rows 2/3. */ \
      __m256i _lo01_m = __lasx_xvilvl_d(_in1, _in0); \
      __m256i _hi01_m = __lasx_xvilvh_d(_in1, _in0); \
      __m256i _lo23_m = __lasx_xvilvl_d(_in3, _in2); \
      __m256i _hi23_m = __lasx_xvilvh_d(_in3, _in2); \
      /* Combine the pair results into the final rows. */ \
      _out0 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x20); \
      _out2 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x31); \
      _out1 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x20); \
      _out3 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x31); \
    }
  /*
   * =============================================================================
   * Description : Transpose an 8x8 block of word elements held in eight
   *               vectors.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7
   * Details     : Two rounds of word interleaves build eight intermediate
   *               vectors (rows 0-3 and rows 4-7 separately); xvpermi_q then
   *               pairs them into the outputs. All inputs are read before any
   *               output is written.
   * Example     : LASX_TRANSPOSE8x8_W
   *        _in0  : 1,2,3,4,5,6,7,8
   *        _in1  : 2,2,3,4,5,6,7,8
   *        _in2  : 3,2,3,4,5,6,7,8
   *        _in3  : 4,2,3,4,5,6,7,8
   *        _in4  : 5,2,3,4,5,6,7,8
   *        _in5  : 6,2,3,4,5,6,7,8
   *        _in6  : 7,2,3,4,5,6,7,8
   *        _in7  : 8,2,3,4,5,6,7,8
   *
   *        _out0 : 1,2,3,4,5,6,7,8
   *        _out1 : 2,2,2,2,2,2,2,2
   *        _out2 : 3,3,3,3,3,3,3,3
   *        _out3 : 4,4,4,4,4,4,4,4
   *        _out4 : 5,5,5,5,5,5,5,5
   *        _out5 : 6,6,6,6,6,6,6,6
   *        _out6 : 7,7,7,7,7,7,7,7
   *        _out7 : 8,8,8,8,8,8,8,8
   * =============================================================================
   */
  #define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                              _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                              _out7) \
    { \
      /* Round 1: interleave words of rows two apart. */ \
      __m256i _lo02_m = __lasx_xvilvl_w(_in2, _in0); \
      __m256i _lo13_m = __lasx_xvilvl_w(_in3, _in1); \
      __m256i _hi02_m = __lasx_xvilvh_w(_in2, _in0); \
      __m256i _hi13_m = __lasx_xvilvh_w(_in3, _in1); \
      __m256i _lo46_m = __lasx_xvilvl_w(_in6, _in4); \
      __m256i _lo57_m = __lasx_xvilvl_w(_in7, _in5); \
      __m256i _hi46_m = __lasx_xvilvh_w(_in6, _in4); \
      __m256i _hi57_m = __lasx_xvilvh_w(_in7, _in5); \
      /* Round 2: interleave the round-1 results (rows 0-3, then rows 4-7). */ \
      __m256i _r0_m = __lasx_xvilvl_w(_lo13_m, _lo02_m); \
      __m256i _r1_m = __lasx_xvilvh_w(_lo13_m, _lo02_m); \
      __m256i _r2_m = __lasx_xvilvl_w(_hi13_m, _hi02_m); \
      __m256i _r3_m = __lasx_xvilvh_w(_hi13_m, _hi02_m); \
      __m256i _r4_m = __lasx_xvilvl_w(_lo57_m, _lo46_m); \
      __m256i _r5_m = __lasx_xvilvh_w(_lo57_m, _lo46_m); \
      __m256i _r6_m = __lasx_xvilvl_w(_hi57_m, _hi46_m); \
      __m256i _r7_m = __lasx_xvilvh_w(_hi57_m, _hi46_m); \
      /* Pair rows 0-3 results with rows 4-7 results. */ \
      _out0 = __lasx_xvpermi_q(_r4_m, _r0_m, 0x20); \
      _out1 = __lasx_xvpermi_q(_r5_m, _r1_m, 0x20); \
      _out2 = __lasx_xvpermi_q(_r6_m, _r2_m, 0x20); \
      _out3 = __lasx_xvpermi_q(_r7_m, _r3_m, 0x20); \
      _out4 = __lasx_xvpermi_q(_r4_m, _r0_m, 0x31); \
      _out5 = __lasx_xvpermi_q(_r5_m, _r1_m, 0x31); \
      _out6 = __lasx_xvpermi_q(_r6_m, _r2_m, 0x31); \
      _out7 = __lasx_xvpermi_q(_r7_m, _r3_m, 0x31); \
    }
  /*
   * =============================================================================
   * Description : Transpose an input 16x8 byte block.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
   *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
   *                         (input 16x8 byte block)
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7 (output 8x16 byte block)
   * Details     : The rows of the matrix become columns, and the columns become
   *               rows. The outputs double as scratch storage between the byte
   *               and word interleave rounds; every input is read before the
   *               first output is written.
   * Example     : See LASX_TRANSPOSE16x8_H
   * =============================================================================
   */
  #define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                               _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                               _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                               _out6, _out7) \
    { \
      __m256i _b0_m, _b1_m, _b2_m, _b3_m, _b4_m, _b5_m, _b6_m, _b7_m; \
      __m256i _w0_m, _w1_m, _w2_m, _w3_m, _w4_m, _w5_m, _w6_m, _w7_m; \
      \
      /* Round 1: byte interleave of rows two apart. */ \
      _b0_m = __lasx_xvilvl_b(_in2, _in0); \
      _b1_m = __lasx_xvilvl_b(_in3, _in1); \
      _b2_m = __lasx_xvilvl_b(_in6, _in4); \
      _b3_m = __lasx_xvilvl_b(_in7, _in5); \
      _b4_m = __lasx_xvilvl_b(_in10, _in8); \
      _b5_m = __lasx_xvilvl_b(_in11, _in9); \
      _b6_m = __lasx_xvilvl_b(_in14, _in12); \
      _b7_m = __lasx_xvilvl_b(_in15, _in13); \
      /* Round 2: byte interleave again; outputs serve as scratch here. */ \
      _out0 = __lasx_xvilvl_b(_b1_m, _b0_m); \
      _out1 = __lasx_xvilvh_b(_b1_m, _b0_m); \
      _out2 = __lasx_xvilvl_b(_b3_m, _b2_m); \
      _out3 = __lasx_xvilvh_b(_b3_m, _b2_m); \
      _out4 = __lasx_xvilvl_b(_b5_m, _b4_m); \
      _out5 = __lasx_xvilvh_b(_b5_m, _b4_m); \
      _out6 = __lasx_xvilvl_b(_b7_m, _b6_m); \
      _out7 = __lasx_xvilvh_b(_b7_m, _b6_m); \
      /* Round 3: word interleave of the scratch values. */ \
      _w0_m = __lasx_xvilvl_w(_out2, _out0); \
      _w2_m = __lasx_xvilvh_w(_out2, _out0); \
      _w4_m = __lasx_xvilvl_w(_out3, _out1); \
      _w6_m = __lasx_xvilvh_w(_out3, _out1); \
      _w1_m = __lasx_xvilvl_w(_out6, _out4); \
      _w3_m = __lasx_xvilvh_w(_out6, _out4); \
      _w5_m = __lasx_xvilvl_w(_out7, _out5); \
      _w7_m = __lasx_xvilvh_w(_out7, _out5); \
      /* Round 4: double-word interleave produces the final rows. */ \
      _out0 = __lasx_xvilvl_d(_w1_m, _w0_m); \
      _out1 = __lasx_xvilvh_d(_w1_m, _w0_m); \
      _out2 = __lasx_xvilvl_d(_w3_m, _w2_m); \
      _out3 = __lasx_xvilvh_d(_w3_m, _w2_m); \
      _out4 = __lasx_xvilvl_d(_w5_m, _w4_m); \
      _out5 = __lasx_xvilvh_d(_w5_m, _w4_m); \
      _out6 = __lasx_xvilvl_d(_w7_m, _w6_m); \
      _out7 = __lasx_xvilvh_d(_w7_m, _w6_m); \
    }
  /*
   * =============================================================================
   * Description : Transpose an input 16x8 halfword block.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
   *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
   *                         (input 16x8 halfword block)
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7 (output 8x16 halfword block)
   * Details     : The rows of the matrix become columns, and the columns become
   *               rows. The work is done in two passes: the first pass uses
   *               the low-interleaved halfwords and writes _out0.._out3, the
   *               second pass re-reads every input for the high-interleaved
   *               halfwords and writes _out4.._out7. Because the inputs are
   *               read again after _out0.._out3 have been written, those
   *               outputs must not name the same variables as any input.
   * Example     : LASX_TRANSPOSE16x8_H
   *        _in0  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in1  : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in2  : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in3  : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in4  : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in5  : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in6  : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in7  : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in8  : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in9  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *        _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
   *
   *        _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
   *        _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
   *        _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
   *        _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
   *        _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
   *        _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
   *        _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
   *        _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
   * =============================================================================
   */
  #define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                               _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                               _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                               _out6, _out7) \
    { \
      /* Pass 1: low-interleaved halfwords -> _out0.._out3. */ \
      { \
        __m256i _p0_m, _p1_m, _p2_m, _p3_m, _p4_m, _p5_m, _p6_m, _p7_m; \
        __m256i _q0_m, _q1_m, _q2_m, _q3_m, _q4_m, _q5_m, _q6_m, _q7_m; \
        __m256i _r0_m, _r1_m, _r2_m, _r3_m, _r4_m, _r5_m, _r6_m, _r7_m; \
        \
        _p0_m = __lasx_xvilvl_h(_in2, _in0); \
        _p1_m = __lasx_xvilvl_h(_in3, _in1); \
        _p2_m = __lasx_xvilvl_h(_in6, _in4); \
        _p3_m = __lasx_xvilvl_h(_in7, _in5); \
        _p4_m = __lasx_xvilvl_h(_in10, _in8); \
        _p5_m = __lasx_xvilvl_h(_in11, _in9); \
        _p6_m = __lasx_xvilvl_h(_in14, _in12); \
        _p7_m = __lasx_xvilvl_h(_in15, _in13); \
        _q0_m = __lasx_xvilvl_h(_p1_m, _p0_m); \
        _q1_m = __lasx_xvilvh_h(_p1_m, _p0_m); \
        _q2_m = __lasx_xvilvl_h(_p3_m, _p2_m); \
        _q3_m = __lasx_xvilvh_h(_p3_m, _p2_m); \
        _q4_m = __lasx_xvilvl_h(_p5_m, _p4_m); \
        _q5_m = __lasx_xvilvh_h(_p5_m, _p4_m); \
        _q6_m = __lasx_xvilvl_h(_p7_m, _p6_m); \
        _q7_m = __lasx_xvilvh_h(_p7_m, _p6_m); \
        _r0_m = __lasx_xvilvl_d(_q2_m, _q0_m); \
        _r2_m = __lasx_xvilvh_d(_q2_m, _q0_m); \
        _r4_m = __lasx_xvilvl_d(_q3_m, _q1_m); \
        _r6_m = __lasx_xvilvh_d(_q3_m, _q1_m); \
        _r1_m = __lasx_xvilvl_d(_q6_m, _q4_m); \
        _r3_m = __lasx_xvilvh_d(_q6_m, _q4_m); \
        _r5_m = __lasx_xvilvl_d(_q7_m, _q5_m); \
        _r7_m = __lasx_xvilvh_d(_q7_m, _q5_m); \
        _out0 = __lasx_xvpermi_q(_r1_m, _r0_m, 0x20); \
        _out1 = __lasx_xvpermi_q(_r3_m, _r2_m, 0x20); \
        _out2 = __lasx_xvpermi_q(_r5_m, _r4_m, 0x20); \
        _out3 = __lasx_xvpermi_q(_r7_m, _r6_m, 0x20); \
      } \
      /* Pass 2: high-interleaved halfwords -> _out4.._out7. */ \
      { \
        __m256i _p0_m, _p1_m, _p2_m, _p3_m, _p4_m, _p5_m, _p6_m, _p7_m; \
        __m256i _q0_m, _q1_m, _q2_m, _q3_m, _q4_m, _q5_m, _q6_m, _q7_m; \
        __m256i _r0_m, _r1_m, _r2_m, _r3_m, _r4_m, _r5_m, _r6_m, _r7_m; \
        \
        _p0_m = __lasx_xvilvh_h(_in2, _in0); \
        _p1_m = __lasx_xvilvh_h(_in3, _in1); \
        _p2_m = __lasx_xvilvh_h(_in6, _in4); \
        _p3_m = __lasx_xvilvh_h(_in7, _in5); \
        _p4_m = __lasx_xvilvh_h(_in10, _in8); \
        _p5_m = __lasx_xvilvh_h(_in11, _in9); \
        _p6_m = __lasx_xvilvh_h(_in14, _in12); \
        _p7_m = __lasx_xvilvh_h(_in15, _in13); \
        _q0_m = __lasx_xvilvl_h(_p1_m, _p0_m); \
        _q1_m = __lasx_xvilvh_h(_p1_m, _p0_m); \
        _q2_m = __lasx_xvilvl_h(_p3_m, _p2_m); \
        _q3_m = __lasx_xvilvh_h(_p3_m, _p2_m); \
        _q4_m = __lasx_xvilvl_h(_p5_m, _p4_m); \
        _q5_m = __lasx_xvilvh_h(_p5_m, _p4_m); \
        _q6_m = __lasx_xvilvl_h(_p7_m, _p6_m); \
        _q7_m = __lasx_xvilvh_h(_p7_m, _p6_m); \
        _r0_m = __lasx_xvilvl_d(_q2_m, _q0_m); \
        _r2_m = __lasx_xvilvh_d(_q2_m, _q0_m); \
        _r4_m = __lasx_xvilvl_d(_q3_m, _q1_m); \
        _r6_m = __lasx_xvilvh_d(_q3_m, _q1_m); \
        _r1_m = __lasx_xvilvl_d(_q6_m, _q4_m); \
        _r3_m = __lasx_xvilvh_d(_q6_m, _q4_m); \
        _r5_m = __lasx_xvilvl_d(_q7_m, _q5_m); \
        _r7_m = __lasx_xvilvh_d(_q7_m, _q5_m); \
        _out4 = __lasx_xvpermi_q(_r1_m, _r0_m, 0x20); \
        _out5 = __lasx_xvpermi_q(_r3_m, _r2_m, 0x20); \
        _out6 = __lasx_xvpermi_q(_r5_m, _r4_m, 0x20); \
        _out7 = __lasx_xvpermi_q(_r7_m, _r6_m, 0x20); \
      } \
    }
  /*
   * =============================================================================
   * Description : Transpose a 4x4 block of halfword elements held in four
   *               vectors.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3
   *               Outputs - _out0, _out1, _out2, _out3
   *               Return Type - signed halfword
   * Details     : The rows of the matrix become columns, and the columns become
   *               rows. _out0 and _out2 are produced first; _out1 and _out3 are
   *               then derived from them with a high double-word interleave.
   * Example     : See LASX_TRANSPOSE8x8_H
   * =============================================================================
   */
  #define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                              _out3) \
    { \
      /* Interleave halfwords of rows 0/1 and rows 2/3. */ \
      __m256i _rows01_m = __lasx_xvilvl_h(_in1, _in0); \
      __m256i _rows23_m = __lasx_xvilvl_h(_in3, _in2); \
      \
      _out0 = __lasx_xvilvl_w(_rows23_m, _rows01_m); \
      _out2 = __lasx_xvilvh_w(_rows23_m, _rows01_m); \
      /* Odd outputs come from the high double-words of the even ones. */ \
      _out1 = __lasx_xvilvh_d(_out0, _out0); \
      _out3 = __lasx_xvilvh_d(_out2, _out2); \
    }
  /*
   * =============================================================================
   * Description : Transpose an input 8x8 byte block.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
   *                         (input 8x8 byte block)
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7 (output 8x8 byte block)
   * Details     : Two rounds of byte interleaves followed by a word interleave
   *               yield the even-numbered outputs; each odd-numbered output is
   *               the preceding even output shifted right by 8 bytes
   *               (xvbsrl_v). All inputs are read before any output is written.
   * Example     : See LASX_TRANSPOSE8x8_H
   * =============================================================================
   */
  #define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                              _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                              _out7) \
    { \
      /* Round 1: byte interleave of rows two apart. */ \
      __m256i _e02_m = __lasx_xvilvl_b(_in2, _in0); \
      __m256i _e13_m = __lasx_xvilvl_b(_in3, _in1); \
      __m256i _e46_m = __lasx_xvilvl_b(_in6, _in4); \
      __m256i _e57_m = __lasx_xvilvl_b(_in7, _in5); \
      /* Round 2: byte interleave of the round-1 results. */ \
      __m256i _q0_m = __lasx_xvilvl_b(_e13_m, _e02_m); \
      __m256i _q1_m = __lasx_xvilvh_b(_e13_m, _e02_m); \
      __m256i _q2_m = __lasx_xvilvl_b(_e57_m, _e46_m); \
      __m256i _q3_m = __lasx_xvilvh_b(_e57_m, _e46_m); \
      /* Even outputs from a word interleave. */ \
      _out0 = __lasx_xvilvl_w(_q2_m, _q0_m); \
      _out2 = __lasx_xvilvh_w(_q2_m, _q0_m); \
      _out4 = __lasx_xvilvl_w(_q3_m, _q1_m); \
      _out6 = __lasx_xvilvh_w(_q3_m, _q1_m); \
      /* Odd outputs: the matching even output shifted right by 8 bytes. */ \
      _out1 = __lasx_xvbsrl_v(_out0, 8); \
      _out3 = __lasx_xvbsrl_v(_out2, 8); \
      _out5 = __lasx_xvbsrl_v(_out4, 8); \
      _out7 = __lasx_xvbsrl_v(_out6, 8); \
    }
  /*
   * =============================================================================
   * Description : Transpose an 8x8 block of halfword elements held in eight
   *               vectors.
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7
   * Details     : The rows of the matrix become columns, and the columns become
   *               rows. Rows 4-7 and rows 0-3 are each reduced by two rounds of
   *               halfword interleaves; even outputs then pick the even
   *               double-words of the paired results and odd outputs pick the
   *               odd double-words. All inputs are read before any output is
   *               written.
   * Example     : LASX_TRANSPOSE8x8_H
   *        _in0  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
   *        _in1  : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
   *        _in2  : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
   *        _in3  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
   *        _in4  : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
   *        _in5  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
   *        _in6  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
   *        _in7  : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
   *
   *        _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
   *        _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
   *        _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
   *        _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
   *        _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
   *        _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
   *        _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
   *        _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
   * =============================================================================
   */
  #define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                              _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                              _out7) \
    { \
      /* Rows 4-7: first-round interleaves. */ \
      __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4); \
      __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5); \
      __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4); \
      __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5); \
      /* Rows 0-3: first-round interleaves. */ \
      __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0); \
      __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1); \
      __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0); \
      __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1); \
      /* Second round for rows 4-7. */ \
      __m256i _bot0_m = __lasx_xvilvl_h(_lo57_m, _lo46_m); \
      __m256i _bot1_m = __lasx_xvilvh_h(_lo57_m, _lo46_m); \
      __m256i _bot2_m = __lasx_xvilvl_h(_hi57_m, _hi46_m); \
      __m256i _bot3_m = __lasx_xvilvh_h(_hi57_m, _hi46_m); \
      /* Second round for rows 0-3. */ \
      __m256i _top0_m = __lasx_xvilvl_h(_lo13_m, _lo02_m); \
      __m256i _top1_m = __lasx_xvilvh_h(_lo13_m, _lo02_m); \
      __m256i _top2_m = __lasx_xvilvl_h(_hi13_m, _hi02_m); \
      __m256i _top3_m = __lasx_xvilvh_h(_hi13_m, _hi02_m); \
      /* Even outputs take the even double-words, odd outputs the odd ones. */ \
      _out0 = __lasx_xvpickev_d(_bot0_m, _top0_m); \
      _out2 = __lasx_xvpickev_d(_bot1_m, _top1_m); \
      _out4 = __lasx_xvpickev_d(_bot2_m, _top2_m); \
      _out6 = __lasx_xvpickev_d(_bot3_m, _top3_m); \
      _out1 = __lasx_xvpickod_d(_bot0_m, _top0_m); \
      _out3 = __lasx_xvpickod_d(_bot1_m, _top1_m); \
      _out5 = __lasx_xvpickod_d(_bot2_m, _top2_m); \
      _out7 = __lasx_xvpickod_d(_bot3_m, _top3_m); \
    }
  /*
   * =============================================================================
   * Description : Butterfly of 4 input vectors (byte, halfword, word and
   *               double-word variants: LASX_BUTTERFLY_4_B/_H/_W/_D).
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3
   *               Outputs - _out0, _out1, _out2, _out3
   * Details     : Sums of the outer and inner pairs go to the first two
   *               outputs, differences of the inner and outer pairs to the
   *               last two. Outputs are assigned in order, so an output that
   *               names the same variable as an input still to be read changes
   *               the result.
   * Example     : LASX_BUTTERFLY_4
   *               _out0 = _in0 + _in3;
   *               _out1 = _in1 + _in2;
   *               _out2 = _in1 - _in2;
   *               _out3 = _in0 - _in3;
   * =============================================================================
   */
  #define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
      /* Sums: outer pair, then inner pair. */ \
      _out0 = __lasx_xvadd_b(_in0, _in3); \
      _out1 = __lasx_xvadd_b(_in1, _in2); \
      /* Differences: inner pair, then outer pair. */ \
      _out2 = __lasx_xvsub_b(_in1, _in2); \
      _out3 = __lasx_xvsub_b(_in0, _in3); \
    }
  /* Halfword variant of the 4-input butterfly: sums of (_in0,_in3) and
   * (_in1,_in2), then differences of (_in1,_in2) and (_in0,_in3). */
  #define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
      /* Sums: outer pair, then inner pair. */ \
      _out0 = __lasx_xvadd_h(_in0, _in3); \
      _out1 = __lasx_xvadd_h(_in1, _in2); \
      /* Differences: inner pair, then outer pair. */ \
      _out2 = __lasx_xvsub_h(_in1, _in2); \
      _out3 = __lasx_xvsub_h(_in0, _in3); \
    }
  /* Word variant of the 4-input butterfly: sums of (_in0,_in3) and
   * (_in1,_in2), then differences of (_in1,_in2) and (_in0,_in3). */
  #define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
      /* Sums: outer pair, then inner pair. */ \
      _out0 = __lasx_xvadd_w(_in0, _in3); \
      _out1 = __lasx_xvadd_w(_in1, _in2); \
      /* Differences: inner pair, then outer pair. */ \
      _out2 = __lasx_xvsub_w(_in1, _in2); \
      _out3 = __lasx_xvsub_w(_in0, _in3); \
    }
  /* Double-word variant of the 4-input butterfly: sums of (_in0,_in3) and
   * (_in1,_in2), then differences of (_in1,_in2) and (_in0,_in3). */
  #define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
      /* Sums: outer pair, then inner pair. */ \
      _out0 = __lasx_xvadd_d(_in0, _in3); \
      _out1 = __lasx_xvadd_d(_in1, _in2); \
      /* Differences: inner pair, then outer pair. */ \
      _out2 = __lasx_xvsub_d(_in1, _in2); \
      _out3 = __lasx_xvsub_d(_in0, _in3); \
    }
  /*
   * =============================================================================
   * Description : Butterfly of 8 input vectors (byte, halfword, word and
   *               double-word variants: LASX_BUTTERFLY_8_B/_H/_W/_D).
   * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
   *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
   *                         _out7
   * Details     : Input k is paired with input 7-k. The four sums fill
   *               _out0.._out3 from the outermost pair inwards; the four
   *               differences fill _out4.._out7 from the innermost pair
   *               outwards. Outputs are assigned in order, so an output that
   *               names the same variable as an input still to be read changes
   *               the result.
   * Example     : LASX_BUTTERFLY_8
   *               _out0 = _in0 + _in7;
   *               _out1 = _in1 + _in6;
   *               _out2 = _in2 + _in5;
   *               _out3 = _in3 + _in4;
   *               _out4 = _in3 - _in4;
   *               _out5 = _in2 - _in5;
   *               _out6 = _in1 - _in6;
   *               _out7 = _in0 - _in7;
   * =============================================================================
   */
  #define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      /* Sums, outermost pair first. */ \
      _out0 = __lasx_xvadd_b(_in0, _in7); \
      _out1 = __lasx_xvadd_b(_in1, _in6); \
      _out2 = __lasx_xvadd_b(_in2, _in5); \
      _out3 = __lasx_xvadd_b(_in3, _in4); \
      /* Differences, innermost pair first. */ \
      _out4 = __lasx_xvsub_b(_in3, _in4); \
      _out5 = __lasx_xvsub_b(_in2, _in5); \
      _out6 = __lasx_xvsub_b(_in1, _in6); \
      _out7 = __lasx_xvsub_b(_in0, _in7); \
    }
  /* Halfword variant of the 8-input butterfly: input k is paired with input
   * 7-k; sums fill _out0.._out3, differences fill _out4.._out7. */
  #define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      /* Sums, outermost pair first. */ \
      _out0 = __lasx_xvadd_h(_in0, _in7); \
      _out1 = __lasx_xvadd_h(_in1, _in6); \
      _out2 = __lasx_xvadd_h(_in2, _in5); \
      _out3 = __lasx_xvadd_h(_in3, _in4); \
      /* Differences, innermost pair first. */ \
      _out4 = __lasx_xvsub_h(_in3, _in4); \
      _out5 = __lasx_xvsub_h(_in2, _in5); \
      _out6 = __lasx_xvsub_h(_in1, _in6); \
      _out7 = __lasx_xvsub_h(_in0, _in7); \
    }
  /* Word variant of the 8-input butterfly: input k is paired with input 7-k;
   * sums fill _out0.._out3, differences fill _out4.._out7. */
  #define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      /* Sums, outermost pair first. */ \
      _out0 = __lasx_xvadd_w(_in0, _in7); \
      _out1 = __lasx_xvadd_w(_in1, _in6); \
      _out2 = __lasx_xvadd_w(_in2, _in5); \
      _out3 = __lasx_xvadd_w(_in3, _in4); \
      /* Differences, innermost pair first. */ \
      _out4 = __lasx_xvsub_w(_in3, _in4); \
      _out5 = __lasx_xvsub_w(_in2, _in5); \
      _out6 = __lasx_xvsub_w(_in1, _in6); \
      _out7 = __lasx_xvsub_w(_in0, _in7); \
    }
  /* Double-word variant of the 8-input butterfly: input k is paired with input
   * 7-k; sums fill _out0.._out3, differences fill _out4.._out7. */
  #define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                             _out7) \
    { \
      /* Sums, outermost pair first. */ \
      _out0 = __lasx_xvadd_d(_in0, _in7); \
      _out1 = __lasx_xvadd_d(_in1, _in6); \
      _out2 = __lasx_xvadd_d(_in2, _in5); \
      _out3 = __lasx_xvadd_d(_in3, _in4); \
      /* Differences, innermost pair first. */ \
      _out4 = __lasx_xvsub_d(_in3, _in4); \
      _out5 = __lasx_xvsub_d(_in2, _in5); \
      _out6 = __lasx_xvsub_d(_in1, _in6); \
      _out7 = __lasx_xvsub_d(_in0, _in7); \
    }
  1806. #endif // LASX
  /*
   * =============================================================================
   * Description : Print out elements in vector.
   * Arguments   : Inputs  - RTYPE       (type the value is cast to before
   *                                      indexing; must support '[]')
   *                       - element_num (number of leading elements to print)
   *                       - in0         (value to print; evaluated once)
   *                       - enter       (non-zero: print "\nVP:" first)
   *               Outputs - none (text is written with printf)
   * Details     : Prints the first 'element_num' elements of 'in0', viewed as
   *               'RTYPE', each followed by a comma. Elements are formatted
   *               with "%d", so element types wider than int are not printed
   *               faithfully. Every macro parameter is parenthesized, so
   *               arbitrary expressions (e.g. a ternary element count) are
   *               safe to pass.
   * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
   *               VP:1,2,3,4,
   * =============================================================================
   */
  #define VECT_PRINT(RTYPE, element_num, in0, enter) \
    { \
      /* Locals carry a _vp_ prefix to avoid shadowing caller variables. */ \
      RTYPE _vp_vec_m = (RTYPE)(in0); \
      int _vp_idx_m; \
      if (enter) { \
        printf("\nVP:"); \
      } \
      for (_vp_idx_m = 0; _vp_idx_m < (element_num); _vp_idx_m++) { \
        printf("%d,", _vp_vec_m[_vp_idx_m]); \
      } \
    }
  1827. #endif /* LOONGSON_INTRINSICS_H */
  1828. #endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */