From 78be44d9112c84eac8e20fbd65ca680faf323d2b Mon Sep 17 00:00:00 2001
From: Shivam Khandelwal <shivam@aicrowd.com>
Date: Fri, 28 May 2021 03:39:11 +0000
Subject: [PATCH] Add siammot/data/ files previously missed due to .gitignore

---
 .../build_augmentation.cpython-37.pyc         | Bin 0 -> 2400 bytes
 .../image_augmentation.cpython-37.pyc         | Bin 0 -> 6391 bytes
 .../video_augmentation.cpython-37.pyc         | Bin 0 -> 6125 bytes
 .../augmentation/build_augmentation.py        |  85 +++++++
 .../augmentation/image_augmentation.py        | 187 ++++++++++++++
 .../augmentation/video_augmentation.py        | 187 ++++++++++++++
 .../data/adapters/handler/data_filtering.py   | 140 +++++++++++
 .../siammot/data/adapters/utils/data_utils.py |  62 +++++
 .../data/adapters/utils/dataset_info.py       |  49 ++++
 .../data/build_inference_data_loader.py       |  56 +++++
 .../siammot/data/build_train_data_loader.py   |  77 ++++++
 siam-mot/siammot/data/image_dataset.py        | 232 ++++++++++++++++++
 siam-mot/siammot/data/ingestion/ingest_mot.py | 197 +++++++++++++++
 .../siammot/data/ingestion/ingest_prim_air.py | 127 ++++++++++
 siam-mot/siammot/data/video_dataset.py        | 195 +++++++++++++++
 15 files changed, 1594 insertions(+)
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/build_augmentation.py
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/image_augmentation.py
 create mode 100644 siam-mot/siammot/data/adapters/augmentation/video_augmentation.py
 create mode 100644 siam-mot/siammot/data/adapters/handler/data_filtering.py
 create mode 100644 siam-mot/siammot/data/adapters/utils/data_utils.py
 create mode 100644 siam-mot/siammot/data/adapters/utils/dataset_info.py
 create mode 100644 siam-mot/siammot/data/build_inference_data_loader.py
 create mode 100644 siam-mot/siammot/data/build_train_data_loader.py
 create mode 100644 siam-mot/siammot/data/image_dataset.py
 create mode 100644 siam-mot/siammot/data/ingestion/ingest_mot.py
 create mode 100644 siam-mot/siammot/data/ingestion/ingest_prim_air.py
 create mode 100644 siam-mot/siammot/data/video_dataset.py

diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb1cf3b9ca47c6a77d5d63802e13b9cbd6604614
GIT binary patch
literal 2400
zcmZt{%Wm67kV{e$MN76MyGfe|P!vH?6|kMA4bnrJM1I6kU`vKXrzz25OKEADHXqKC
z3KEw*DW?MYhxXW8zh`?1&{KXPr_L-TTQ0K1%+Act?#|AO8`Y|*!1(9!-@m>C>R)7L
zLkHp!K+FI!g%Mu~h)n|3Rs+q}0^Qbwf?WuTb}=Z~C4xGtZv<t#9GJEVyvB5YA*iSd
zQG%+y2$BU>V#XU{FYPF-%*;0mGu_%mv+GRnE6vJ#0#`LlVyW*r!84D!QO}LNGgqvf
z3pOJ^;!nLKak;oYCk;-y8^>N0w#K8t4HI$Yd~q*I=H%D@C*SbLe75K2`C88jSrj~p
zxOWzXiQ_-^y;D(>CRyAG<HLvtE^n&Bup-M1<A@`*vD3SA_wIXmgwnA@;U&Ics38ku
z834V#2@qG{rHo`6fDTYd2|rAgW967+MLDZkiK&?(X)P<uS<g&47qSI87qd#LGVMg=
zze~Q9X>v9)UC!mK5Z;oMIX!g&P9>QEPzEpo762+zr;=4+t{yLDi;`c;mXcapPR(>7
ztuXzqp4F22aYL>xOT3aUvBFy|t0$kNwd6{wPe^h#Edc&B)za!jO+HJDX(?TtXvwwI
zV8yp2xt`pZD4=mYtxw3vn2@ZIHX!z;&ItS>$CP;YP!znt8M#eem;>&Oj*<{UEQ}#U
zggbE}%-BiB+`;Gy?P%;aHK7Nt15BJSQBEQ{7;y+llL-BkM}y`~QD}Ge4=k~`*Y41M
z+uo*Duhs4V-+DgJn;X6M&J(M%-R}!yv)ci!z9r0l%R1<_tai5}v?mAKqO#XThVHia
z+LowIxsBa}9^LPCH$;82ySLwiLoSr67dLOik=2WfkuA3Nx?8PXVQ#go)^4}8wcQhS
zY3Ek^S-aofXz#YIpG4W}(mdBVW|ttAD0d(u9UqIu+;2bM-lcont&S+?bl=(%rOhZf
zjbfJ<VGGCV<^@D5q7otlQ5jHkS!lz<kubcNf~&pIHiC%3>B(!m63B9*z883jU7K<P
zs3M9!w(CQ<W?GB6k}v`<q_U2Uz<DJpd*u)+iXNR8nB2@4(SjvJHAHnlcF~b{^IIrc
zK}kav79C<MpvSTo*vsMU{b+GsppXJH@-KzohOHl7t{+8#yFMO_!(_Z3Lz`KJ7pyxT
zA4EKKSDBl*LrkdEIB_^Mnbi|7xzNK6sL!0lS$CLo3L5dcleZk1i0gx~=QBDlT04C$
zuI6Ige`yW99b14DRVVpp5CbU0@TvK1YB<j{^jxZ{B>xOeCo7r?d=Wl_n7B`?X#5VG
zJH*3h1GD83K>Q^jSFs6GY?TqFzENzA8BAvd=*~JTvJyxNOoPfS^RGd9h-rcYX9U^C
zu%D$QRVXxnP$eBGHS^r*G4w}0cKt*CE$GPr<mek<g+ZwodI_aFz{T=-a#our>Dp|u
zbKXBOV-e)VX#cAl*oC%~UmDQ&G%!P*s6VS2$<!Cdk7$;9s7{TN1~hg49c<%OK%xlY
z9l41h$w%xOI=)ZoQ_zdCSQJtrjZ3!W3Pb!|;Xt=yElJZJ03q8WVdSckkyqG)RWS2f
zrl#QPdmw{Z`$J3BKM8U#R?u9g=PO`xq4R-!C5U-e`W(h@x3e?VY(g6>Q#$mWIHnZ6
z7dnAUseGC!;t$D5nwor~sOBt<6Yhm0eh*Cg$TPWn$|>!_Vq5`)&#XZDGyvuUk*i^F
zi~K&Q36oMN$*~W&iwdPL#*RO=<t{Aq?-4yjge5nJFzhm!9s?mg3t6q11u1K)7Dp?Z
zjahxq*HGT2(9d(73uAN9x$K7gTU;tH+08g+sr^OZ#3y_hhIHVD!=u3ACu>;7YZvJ%
sbQI*tiU}s`M--#MG4up!<lJNA+ww%dpL^p0rbCQP^p;jr8=CpUe^m`{q5uE@

literal 0
HcmV?d00001

diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c18d2abe5bea310ca77bdeb1bc67d7fc0d3a1a7
GIT binary patch
literal 6391
zcmbtY&5s;M74NG4oX?$Ij~(0Lqj8+z39*fhf&zr#*u++%utbg#QY)y_JKeL}yEEOB
z>Kc1?)pNn-z`+t1Bn}AeB5^=MfB*^p1}-3u%?(cbFK`LJS3T1^yWSwAbgQcCRlln0
zdLO^{s&BVibq&`)@BaS7zn#~#e-p8|3^Z<`WWPZbY9SkHUioAmQ#AK<Jo(TVnV!k$
zoj$Ziw&$oe8`eg3uc6w;usLdZEvDVmLNm0UXrUFgAM4%<XghR3JBsdru7!2bbw#g&
zZiG$HO+~MPZiQ{oZAG5}y%KgncNE<X+51{=^$=gIt@l`AzC8*yBQ)xRNz#{bngl~o
z_U&}~Zk)+J@hvVLjaw+$OQ<4^_UQ4Dh5U(zed?isrx7}#71~djXND$Lsuc~j!u=?V
z_o6<Dj4paoxr36uj7n-blT7jff6TLP&IWA8a+Yhpo^#YjuA??X4O6o9o>kQ2EKV{R
zB>ku`hWOILl#}r=Dx6WA__P*U!L)3x677faLk!x8hq2u48HJHW!-1$_Z9ys)4OG9@
zL<^7qJiNY@j-u<6jY%RW*RwbnT^*(JdJv0^R3y>WFp`lH)>V9hkde5$6U!q$8bJJE
zAcN~c7>vP@T@NOkqbQL93Hf?VV)vK3u8nsK$M@qTmcCD4ktxe#ZDy%^ue&73YYWCc
z2Pa*`TuJmFAaeErpYfS4wSCBOn;)>5fu}w*bI5V0CI-@wCZwxF=Ik+>S)i>Y+Rp9V
zdS1(ou9llS2Ka61<YsPxwx8G37=!qCH1Q-iW{$LT=m~E*y`9yPE_!RKm!nr#y$-&i
z7BbBDSG=haMg8>bYKmKNEcH3yat0il=j(!DXT5r1i69BnQDODB(zqWTsj)Bz!!(eE
z4l_U%Obb0uU>qWyB%#uAuP&OH1Qw)JpiG55D%ch*0%oF&!CNION=3XG!yl9pp$q18
ztgGPZBa|&=GiBALnVro@sxfClWtUKC%wbJv(P15Rce%q1w#E(C;dRz!U4C4<iwV~I
z<a2Z#bC}oBk!$-<@iv26b;<X2IE#M3;WQk|b;K3WaHjX640@;d_@a+o$Pk~!EaD=n
zg58Bcm4l*N3CC2zDJ)nwzD038vrFIa2g9N7zk(?;8l_>6P3)2OdVD_%*3x%j<w-D#
ze7~ss{wNJ6LuxmD|D#DTEZ>L~Ojmv($!}?G@(e*@7A7iMHnUJ+RmXWw^_`AcZ-D6t
zvw&Bv&<IW7lo0`Mf6f%j8lE+$2xAS;F0s{fLNdR0v7%h?j!4HPp6$I=cX3gN8@S^@
z_P5-DNJp;Rid-c^7k28FvUO$Z#xipkHsq4IsqyP~D?O9G$))e0WF1rr1ffO1AMmz>
zKVH=PSkfQHqgWPJ02u<cb)`D-MGO*Gs8XUjk{5~eYbY7nk_KzsJAYz36{;Wt=o{&@
z0v87ZDscG&2v`(g<v^Qp*c%Jkk9Y`cy8xI8VP)HTh<2!-*FxhdAgJO2)GYDnOFRIQ
z`8?(^p5=J}R&yTf7|-%N763}$F>)(6c8puv%!d5z3nVvtE^JcR{b-avj9hFn%Al)2
zx`Bw?B$ciT5yQy6l1`+X4(3YSAoK{tZqE${z;YMh4iI;j;ae$^<}NYnPyrPeXCN%m
zMRiD8JtXx~?AgH5h1nmbNkoreG*&*=b4nfdXw+3yUTsQ9)E~xUueN*CoDvu<dgsl;
z*omU?39T1hq4tWnGT)kyrN4=(Gjbx$sI$xSyTkV`pEQWm2KUL|=%Q_r6DnK%3mSlb
zejdRhhvvC9vq<{;+JW`~g2SvftIry_LmaavARRojR$f<hJ8vj@MY3(4H}h8B&Q}Ho
z<rc7?j@0uGG-?kxWf@QP+3NHXtjyRq543M<Kj<RRt<BEltNT{ImY+G`O#A5J_kobx
zw%7+wI{7M4ay4n@s|Y>t_NSo0UB?`a1ImZ&y-B4%_swv7l1b&l{WKLJLV%3O7RafU
zCB(@l*^c`tmRoUhG_v$~a!ofINBwvZN1?m1>t31Oc*WgCncl?nX0IA?_%SYWwlE$Q
z#&mbO3&MaqM-R|w&ZQrX#>iNjMaR#^NM4{ynfk<xoF^6itwQeyQjmWZ&1fn`;A7!o
zm=$I$qfu7Wf-o$*N?f=>oGr2*9}}sMF6@4qJdA`Cui|B|dt`<MpWZ0=F3R*Ko;L-u
zU`<@0DGajB!XUGxHexN>sGB5#aTNOMeMKq0ic<U)MdQ4RQr^*6%tEfhb_(|Jq-i(t
zhEhMDIN!k+WuHT(S!;-~6ou%fD7%IT+u;^(>WE99Iceg{lT<Lgig^h1lxhDAzD$4z
zGt*~=M4&~$<p*S7$acDv>zL>@p6RuS_aygu;<t&PG9u(y=6TGKThCFyBjG}i0jN91
zPnfur^KElh%WENy4hx8_C`KR%C#qLH3SKM9n7?npx$4NRn$p^~^Cq0igj2Qf1@*iI
z2u41KmD`{jOZ1A$_1cd)_U6F(R?xC!4d1KhZR~+#56H`~ixovtW@ztd`0B5M+o4wW
zby@!*la2i*K$w6T^8g;yjt@+@bz{E;$A%mR9H~p^*xsMOaoY%c$Tu;<0Y5?w_@M_l
zGsifU-t_DeZ^f6OEkUV>co|iXiLc<XiGu6z-|DTDISgfFg2Ji_lZ8Hv5{0wkHS{5p
zicJ{e5oj0fH?s_00kD5dh*StNu)-jF6Q8G9)=~9VOK)+h;|o+(F5($-=ucI>>lqt@
z+NgI00la6z^!!KVW2<n8LKq6XCcZ(n{5YI?Yeba+rwsTQ@@(IhxKAse2>FydMJU;q
zQE9Bs&#`mtB0taRP~qZsaJ)cDiYQP=tBp4oLATi&Yx2|5gEHeoB7V156E9#+aha+Y
zQ5ClDhiTvU%iN9NND$H}8HrEK;$8v^VU9ZOIJ&{=M!VI1UQmA@rMf77-a^S-RL3$o
zL<M#7q<jui4J|_Sd9L;y6{&9zCt~l0g5CEbF-XPev=B}ST9A|>U-9-Sh|{H`(1sKo
zkdaAk8%`Qx<xQ?~9EKh~k1Z<C-lU98R~ehVen_XY_+J3~L$aVA7w_Uxm_f2RjCy=-
zrj@;Fw}Ad;7Q+~kEa-6%c=e-=6wQ0VB*Pgqp=}lI4b(oJ-u-c!F491+F*g}{TeLRk
zBYYxHS_vhFPf;>LJWc7ozR>>BvQ=_eWra4lsVJ$?hUlm%xzs}jSJt*o91#I#%3VR_
zfGeC;tbJq!=!4rMG8<@_vbJAGe<L>!=#7<=1{a6DX6dhl$bweGSq@^0#CNIV1M1+(
zxUhpP5F*(1TJxl=jBA3drl?U)u3i`B1kv-go_*x9o~;BJhhD9cWZ@*^asscVB!*yC
z#>v_^9?qve#n*m{)#gW`i|{DAJBu{?m?TR-!jFs-P9z@CuGHb@6Fug8VEHjEM0pr_
zjN=IE`&8c2DZ^rlL^OhJrODnmj=9S_@4a=;?H|29mis>II3#wu=t)JH2suG{oLL-X
zSg}IrwlptXqX{-Kpz4H#Ni4ra$<*v@@50G%KHWsC6m?;u<hY18T*7^kTEbDc@sERy
z73OIoiB#q(?QLhjM$rkprmXKb@UD5xM2&gup@?%BqUyDju@?3ui3w%ONOBf$$_@F{
z`j9FQl~)f0GRRRR<G$Bg2*KYTN1MuSML?VzsFv-O>>mNCXe<UmS%P%$brwAfYZdfO
zC#84^eZNAfj_h59{jk&1<oRg~l_JkCXr;<fbwPc{3{^+REn+%{s)BWcpcRL6KnbA0
zESv}FgDA;Tar8%o22#00gXt`zesn=~^TYuEICvSaYshk?8of9pyI?Ov^$8Or;rtF|
zVPdO`bzxfb*r)!Tyg(CNrs_qiXxNgR)uiel#SUKl7DdhHFhO)5rMhSbw@{8pHM`or
zqy7ftFGXlT5H%9S?R)oczWVBe3Whm`Q&ECA&Bti}8=C@y7T%SBed&AEr;>O531w;P
z<w})T9hZF-4_yp`3nA^v7cr4e<ZvnV3%5+u%=I!uBUe#B@eV&m3uzjC%KxR7KZ8+O
z68*YZV|!3d^?eJ~;MBC<Gf!gb|Ax~M=KUkhO&fApT^H2TJF8N{(w)s@G8*p+^4>y!
z@9lTXsB%snzUWk)Cf9K4N-}c!Opr>I;d!g^2!VI=+B_uTCoo*T{0oe>!hquCQIPG3
zev<ebQPST+nA*9PNipfm36S{O;=D<x_VS2lr5oG$`K*3*sVqr>zj7>PUh4Cglv^2N
h;k;h{Y<q*!nv9av@{EMPb{x0f>2)4-&UQK-^S{D?%4Glm

literal 0
HcmV?d00001

diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35bc4d792ce302c33d116e8363ab049962ede5b3
GIT binary patch
literal 6125
zcmbtYO>f)C8Rqay6fN7b<9hS8T>%@U5xTJ#+pjc8H;$b;>9!S$ChY>k7F=4SY&sID
z3@NW;$UspC-Amg`(E{yBzO?9}Ko7n2udtT_Jrw&Fdg}AOq$pXI6CkB9qnY7wIP=c)
zJnuW)oS(N9T>squ+ozAND9XR7GP-mWHjv`KBl8qbbd+ZHFPZ{%wWD^mrY@*WL)mDW
zDC?flDRr%;)h##6UAt*_=bCdud8BxzS9+;<CBJf_HRn;cyfW%#-$H!>b=#XmeU9%h
zqF(XlQJ?os+^?d(;4PxQ$oH2}uX;<UFL8Yt^=0o8>X*2F$rDeM`pOrWxzebM)Nc2~
zRuV*^+d*w^E9yk@qaaCq`Gr82x}MH`uj}smKlbC`$WIr!vL&P5nT8)mG{k17FVkw)
zBkJ~~AIE5I(W^$6hQbC?Ttnt7WUi(P1E`+%QZzNML}o%s=V@AL^{0X7M^B|2#_dRU
zec5QKc#<yqr?iC>Uqv>+V9zXBO|T;;;um5d28x3=tXFLu-7E&Fjh0-C_o9BstA$Zg
z+wp6SDD>CrYHEGofAL)@Bbio`(TkmSh&y`hciM6e@A``(=kfR7gY~_r>#z5B`eD*v
zj{~=RyBj6zZXkCe8Tz+9Kk?bTx8uZ>u<Gq+K{D<~4^V&4P26?Yb9?9!ue<%-t{)~Y
z*>wGY4B$*Qt@U1{mg59rkT?!aCN_~N!ou(9%Cy1OMm<|(d%8+U1`->tjY0y`B_dJV
z>WLa(9VkN$rc+PVC(5pJtRd-#cN3l3PSl6W&))k)JvNR_?q>|lQ#G$^Lt|i^3ZV>i
zluRl;U;9`YXal{CvFhUJcXlnm9@J>bwZgu<*4k&%?e%<F!&cXL-9`VqJy*KjtnqHm
zt#yJpsYUHtK-i3@yTR1Eelpo_GdlbMJ#veN=ZEUWLv=H?gE$DWLt)ELbs9T0l76q_
z^IoUgQP4|Gnp)`9OR@r!r3TM0HM3Wx<<S4k$(n2`=FJAbDy{He-pIhovIQCmTiJ3u
z9mlDn7B3@H%mq<Vb)kzDQ5CjWP-kvy_EcSf#*Ml`<AiS4cbwFAoNnaxJ5;VX&hx(8
z$?nKS3@)!xM(CZ$XeP3Zj8rcsfwP2da7zo7;-Wir#a~A@;_v55TX@1#Us_<W#WiAa
z87wZdxQV9mxuPpg6DY9L#V0^L!!*;7)@V@rCtwm#<IBhrB>{&|)S*h?8)&B*X(~af
zQP)x<?qfIQJLt#@tLxd;Slu9WSgowCd&s5c64f0qIKW2l1f3vx0ktc`@*Ovh9cR29
zJkkxC0PBo6x;!=8f=`WtP!_rq1$o}TLE#LcYKaE&&?<}(trBwph4RERBYr<AL8+9X
z1@%&|DEsA8kx>q`GY;=1c0s!}(9Aq8q3_(l1gT1=THc}$D;eDe21+F+-ct}y9~ewL
zts%*@k_CNU63*ljWdt@^ql}G6z(}=@AEpZr;@Fp@KP-ez3v|j%s*@q*CF)snpzu^%
zL9K4F@#U9jv{kxo5fAwOThvDTDd9;KNm^YO@?9!33>kd%@~<Gp^su5Xh-I}ZsyPdm
zGZxI)i;M(kAVh23qLFII5@kq^0BUwD1{#(3RWdQ!v`m!<N@6=T^Tpi6LphSN=~4r)
zXqqlG+dD|P{8mIo9KCf8E!nU+OH1-PjrKNWU#4tKNE$7j&k@=3Nf5>_9GQzPMSDfY
z<M&#kqAe`({<Y9nTpCInNM{{wsc^KGXLu$%S{-~Ur*DrZ?uK5}eH_W)D1sB}Y<Gg*
zf3k7^b$_<^CDDLE6{ldQNcEnKcIs+|iZyf?uSW7RXzBNnMnt>$${Ea{T?r2p?Rc*b
zP&h-oRoZph;gJtZw4rvY43vHSnIhYV8wt~qJ=(3pgPBPQRD>7H{g^p0hh+~nCe@vf
zl@LS)demdR@M~%#W0pO|EI^7)DS7oo8D@&Cqm&wZZP+y}vj^-&v7bAZF@fX^+DzM!
zNVADlkj5gyfIQ?lR+&6UueIyQkI35?iG9h%D4pdt-os-b(j<s@a2uMWmNRIiJ9Ndj
zkd54iXR@oH(62lh`mw7pfJ!S}DKIe+m^NDEB<Q05LU>75rX9vWKjknycNv-1eSnUc
z!!R<3VeiObcP|MMuclTDpqDNp06TV*zU06p)%N=Sh=XIvA_V^gDMPSvH0IF6YzyEP
z?t<7-;PzLj%*HM7iYS=1N`dQ8_7QRIL;m+Ir1GhLtdomCG7mQsJDDTApQtAS5!(B}
za|y{hE)Plr?U^b&a0b?iI0m-iA}mI?i4xomqO>2uSs-F#JyaVYEqMy|8TE7SVjZ_q
zv$Yomh?Q*e3mlro@@w=Ag$42(lzo%>mHn`fpxs4STj0NLPkRb2@D6h(P>??ceky{o
z%b11zPw*0s{mEfis;!a;Q{Z%+qhSu6F4~nG_Id*sq29b0^<Sk070@36fmMva^ps{n
zyK1S5px=Tfsr?KEE)rzT)A{5x7Gyf0mDAM%`q|;?L3Su;Y!^795h-mU#Y8ze_#m)Q
z>DZGWhOT;COOWp~K5$Rt5K3$8{_ziV{o{TnUwWz`a7rsUkHGIcouC^el5m<X<W2b@
zq^@NfNaBsK7@?-y8Cz#8?$2pn6dNhn7E74o=$+YXI!_UbM2jQJFHxDKh`fx&Xo#Lv
zSvx24IC^;G;EltN5{UT(Deiy}CJ_T9Fb4*nZgbpK0v+fGL+#oSgvq79oYCa@=EM;O
zkrW3}lMI=beMxqaL=*Wf%Dzn*qkn@+94wWmC@_Y+yi1+#QC0*?)5ay$KSat@U%L$C
z1M`~AsLu}!7n?fAz1rA*1FioKUfZKr=UQKq)Y3vKE`sC+QcMBPSnG?(gN_DDe!wrK
zSEnFB-w;MnpB-$}Ff6te!A6ep6`&rqM4Z68J8FL5-$ggR5Z+aD0PkRMhvE`RvBTMB
z`Y138SBO#5MklfVHByFXpqUZ<?#0tTk7|;L0@c4q<qc5XoI>^EbErN@^r7WJRuJ-*
z2W8?40-gh4ow1D*gf_Gbl-n~<ZZyWmn?U-6SD2d$btGGC2IkXJKMYvZA7Z^3_^0*%
z)!=W=Wl>bNZ-oeFGDA%GEmH0Sawg!gK>A<7KmBlge)Tqu0wA2CFZKcBH*<-_pF%Hk
z;3GDel3M-`He4&<v^D;K_G&uZz#C`N0Xwgp5H)5|Vv&x<t?^4F33XbXXmKVP@d$Tk
zG2+i?Kg)<4uUNu)N|4Z#ln{R6pCfx6B>?VB;Q6~0ACezVQR0I$l%VM6(@f~$7E1Uw
zHIjG(C60tx;>U;Onczp%GIefdlgxm;e2isWAoDEl<GB9yGXGbyOLihAaN9+2lLPxT
z)_!ATZ{xjxqIc&yZ%S&Zui(2_BwKsz-kPgircWwSxK@11i=~RLe9Dm;O3l$1;o{43
zx}1Fv%i{?Su_Xy3ha0TT863@MXCI$W`7>1ZiGso$j;+}5Fx_}&6WA>1nOgU<&tM;t
a&t@yqCsKrn>SeuprFyNpTD^|n)&Bvs!V(Gq

literal 0
HcmV?d00001

diff --git a/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py
new file mode 100644
index 0000000..147c04c
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py
@@ -0,0 +1,85 @@
+from .video_augmentation import SiamVideoResize, \
+    SiamVideoColorJitter, SiamVideoCompressionAugment, SiamVideoMotionAugment, \
+    SiamVideoMotionBlurAugment, SiamVideoRandomHorizontalFlip, VideoTransformer
+from .image_augmentation import ToTensor, ToBGR255
+
+import maskrcnn_benchmark.data.transforms as T
+
+
+def build_siam_augmentation(cfg, is_train=True, modality='video'):
+
+    motion_limit = 0.0
+    motion_blur_prob = 0.0
+    compression_limit = 0.0
+    if is_train:
+        min_size = cfg.INPUT.MIN_SIZE_TRAIN
+        max_size = cfg.INPUT.MAX_SIZE_TRAIN
+        flip_horizontal_prob = 0.5  # cfg.INPUT.FLIP_PROB_TRAIN
+        brightness = cfg.INPUT.BRIGHTNESS
+        contrast = cfg.INPUT.CONTRAST
+        saturation = cfg.INPUT.SATURATION
+        hue = cfg.INPUT.HUE
+
+        if modality == 'image':
+            motion_limit = cfg.INPUT.MOTION_LIMIT
+            motion_blur_prob = cfg.INPUT.MOTION_BLUR_PROB
+            compression_limit = cfg.INPUT.COMPRESSION_LIMIT
+
+    else:
+        min_size = cfg.INPUT.MIN_SIZE_TEST
+        max_size = cfg.INPUT.MAX_SIZE_TEST
+        flip_horizontal_prob = 0.0
+        brightness = 0.0
+        contrast = 0.0
+        saturation = 0.0
+        hue = 0.0
+
+    amodal = cfg.INPUT.AMODAL
+    SIZE_DIVISIBILITY = cfg.DATALOADER.SIZE_DIVISIBILITY
+    to_bgr255 = cfg.INPUT.TO_BGR255
+
+    video_color_jitter = SiamVideoColorJitter(
+        brightness=brightness,
+        contrast=contrast,
+        saturation=saturation,
+        hue=hue,
+    )
+
+    normalize_transform = T.Normalize(
+        mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255
+    )
+
+    transform = Compose(
+        [
+            video_color_jitter,
+            SiamVideoMotionBlurAugment(motion_blur_prob),
+            SiamVideoCompressionAugment(compression_limit),
+            SiamVideoMotionAugment(motion_limit, amodal),
+            SiamVideoResize(min_size, max_size, SIZE_DIVISIBILITY),
+            SiamVideoRandomHorizontalFlip(prob=flip_horizontal_prob),
+            # PIL image
+            VideoTransformer(ToTensor()),
+            # Torch tensor, CHW (RGB format), and range from [0, 1]
+            # VideoTransformer(ToBGR255(to_bgr255=to_bgr255))
+            VideoTransformer(normalize_transform),
+        ]
+    )
+    return transform
+
+
+class Compose(object):
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, image, target=None):
+        for t in self.transforms:
+            image, target = t(image, target)
+        return image, target
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += "    {0}".format(t)
+        format_string += "\n)"
+        return format_string
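+
+
+if __name__ == "__main__":
+    # Hedged usage sketch (not part of the training pipeline): build the video
+    # transform from the default siam-mot config and push a fake two-frame clip
+    # through it. Assumes siammot.configs.defaults provides the default cfg and
+    # that PIL / maskrcnn_benchmark are installed, as elsewhere in this package.
+    import torch
+    from PIL import Image
+    from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+    from siammot.configs.defaults import cfg
+
+    transform = build_siam_augmentation(cfg, is_train=True, modality='video')
+
+    frame = Image.new('RGB', (640, 480))
+    boxes = BoxList(torch.tensor([[10., 20., 100., 200.]]), frame.size, mode="xyxy")
+    boxes.add_field("labels", torch.tensor([1]))
+    boxes.add_field("ids", torch.tensor([0]))
+
+    clip, targets = transform([frame, frame], [boxes, boxes])
+    # after ToTensor / Normalize each frame is a CHW torch.Tensor
+    print(clip[0].shape, targets[0].bbox)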
\ No newline at end of file
diff --git a/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py
new file mode 100644
index 0000000..adbc582
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py
@@ -0,0 +1,187 @@
+import torch
+import random
+import numpy as np
+from PIL import Image
+from torchvision.transforms import functional as F
+
+import imgaug.augmenters as iaa
+
+from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+
+class ImageResize(object):
+    def __init__(self, min_size, max_size, size_divisibility):
+        if not isinstance(min_size, (list, tuple)):
+            min_size = (min_size,)
+        self.min_size = min_size
+        self.max_size = max_size
+        self.size_divisibility = size_divisibility
+
+    # modified from torchvision to add support for max size
+    def get_size(self, image_size):
+        w, h = image_size
+        size = random.choice(self.min_size)
+        max_size = self.max_size
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(round(max_size * min_original_size / max_original_size))
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+
+        if self.size_divisibility > 0:
+            oh = (int(oh / self.size_divisibility) * self.size_divisibility)
+            ow = (int(ow / self.size_divisibility) * self.size_divisibility)
+
+        return (oh, ow)
+
+    def __call__(self, image, target=None):
+        size = self.get_size(image.size)
+        image = F.resize(image, size)
+        if target is None:
+            return image, target
+        target = target.resize(image.size)
+        return image, target
+
+
+class ImageCropResize(object):
+    """
+    Crop a patch from the image and resize it back to the original image size
+    """
+    def __init__(self, crop_limit=None, amodal=False):
+        self.crop_limit = crop_limit
+        self.amodal = amodal
+
+    def remove_invisible_box(self, box: BoxList):
+        """
+        Remove boxes that are not visible (out of image boundary) after motion augmentation
+        """
+        bbox = box.bbox.clone()
+        xmin_clip = bbox[:, 0].clamp(min=0, max=box.size[0] - 1)
+        ymin_clip = bbox[:, 1].clamp(min=0, max=box.size[1] - 1)
+        xmax_clip = bbox[:, 2].clamp(min=0, max=box.size[0] - 1)
+        ymax_clip = bbox[:, 3].clamp(min=0, max=box.size[1] - 1)
+        keep = (xmax_clip > xmin_clip) & (ymax_clip > ymin_clip)
+
+        return box[keep]
+
+    def boxlist_crop(self, box: BoxList, x1, y1, x2, y2):
+        """
+        Adjust the coordinates of the bounding boxes to the
+        image crop specified by (x1, y1, x2, y2)
+        """
+
+        w, h = (x2 - x1), (y2 - y1)
+        xmin, ymin, xmax, ymax = box._split_into_xyxy()
+        cropped_xmin = (xmin - x1)
+        cropped_ymin = (ymin - y1)
+        cropped_xmax = (xmax - x1)
+        cropped_ymax = (ymax - y1)
+        cropped_bbox = torch.cat(
+            (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1
+        )
+        cropped_box = BoxList(cropped_bbox, (w, h), mode="xyxy")
+        for k, v in box.extra_fields.items():
+            cropped_box.add_field(k, v)
+
+        if self.amodal:
+            # amodal allows the corners of the bbox to go beyond the image boundary
+            cropped_box = self.remove_invisible_box(cropped_box)
+        else:
+            # the corners of bbox need to be within image boundary for non-amodal training
+            cropped_box = cropped_box.clip_to_image(remove_empty=True)
+        return cropped_box.convert(box.mode)
+
+    def __call__(self, image, target):
+        w, h = image.size
+
+        tl_x = int(w * (random.random() * self.crop_limit))
+        tl_y = int(h * (random.random() * self.crop_limit))
+        br_x = int(w - w * (random.random() * self.crop_limit))
+        # keep aspect ratio
+        br_y = int((h / w) * (br_x - tl_x) + tl_y)
+
+        if len(target) > 0:
+            box = target.bbox
+            box_w = box[:, 2] - box[:, 0]
+            box_h = box[:, 3] - box[:, 1]
+            box_area = box_h * box_w
+            max_area_idx = torch.argmax(box_area, dim=0)
+            max_motion_limit_w = int(box_w[max_area_idx] * 0.25)
+            max_motion_limit_h = int(box_h[max_area_idx] * 0.25)
+
+            # make sure at least one bounding box is preserved
+            # after motion augmentation
+            tl_x = min(tl_x, max_motion_limit_w)
+            tl_y = min(tl_y, max_motion_limit_h)
+            br_x = max(br_x, w-max_motion_limit_w)
+            br_y = max(br_y, h-max_motion_limit_h)
+
+        assert (tl_x < br_x) and (tl_y < br_y)
+
+        crop = F.crop(image, tl_y, tl_x, (br_y-tl_y), (br_x-tl_x))
+        crop = F.resize(crop, (h, w))
+        if len(target) > 0:
+            target = self.boxlist_crop(target, tl_x, tl_y, br_x, br_y)
+        target = target.resize(image.size)
+
+        return crop, target
+
+
+class ImageMotionBlur(object):
+    """
+    Perform blur (motion or Gaussian) augmentation on an image
+    """
+    def __init__(self):
+        motion_blur = iaa.MotionBlur(k=10, angle=[-30, 30])
+        gaussian_blur = iaa.GaussianBlur(sigma=(0.0, 2.0))
+
+        self.blur_func_pool = [motion_blur, gaussian_blur]
+
+    def __call__(self, image):
+        blur_id = random.choice(list(range(0, len(self.blur_func_pool))))
+        blur_func = self.blur_func_pool[blur_id]
+        np_image = np.asarray(image)
+        blurred_image = blur_func.augment_image(np_image)
+        pil_image = Image.fromarray(np.uint8(blurred_image))
+        return pil_image
+
+
+class ImageCompression(object):
+    """
+    Perform JPEG compression augmentation on an image
+    """
+    def __init__(self, max_compression):
+        self.max_compression = max_compression
+
+    def __call__(self, image):
+        ratio = random.uniform(0, 1)
+        compression = min(100, int(ratio * self.max_compression))
+        np_image = np.asarray(image)
+        compressed_image = iaa.arithmetic.compress_jpeg(np_image, compression)
+        pil_image = Image.fromarray(np.uint8(compressed_image))
+        return pil_image
+
+
+class ToTensor(object):
+    def __call__(self, image, target=None):
+        return F.to_tensor(image), target
+
+
+class ToBGR255(object):
+    def __init__(self, to_bgr255=True):
+        self.to_bgr255 = to_bgr255
+
+    def __call__(self, image, target=None):
+        if self.to_bgr255:
+            image = image[[2, 1, 0]] * 255
+        return image, target
+
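+
+if __name__ == "__main__":
+    # Hedged sanity check (assumes only PIL and imgaug, both imported above):
+    # run the blur and JPEG-compression augmenters on a dummy image and confirm
+    # the output is still a PIL image of the same size.
+    dummy = Image.new('RGB', (320, 240), color=(127, 127, 127))
+    blurred = ImageMotionBlur()(dummy)
+    compressed = ImageCompression(max_compression=80)(dummy)
+    print(blurred.size, compressed.size)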
diff --git a/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py
new file mode 100644
index 0000000..0f267bf
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py
@@ -0,0 +1,187 @@
+import torch
+import random
+from torchvision.transforms import functional as F
+from torchvision.transforms import ColorJitter as ImageColorJitter
+
+from .image_augmentation import ImageResize, ImageCropResize, \
+    ImageMotionBlur, ImageCompression
+
+
+class VideoTransformer(object):
+    def __init__(self, transform_fn=None):
+        if transform_fn is None:
+            raise ValueError('Transform function should not be None.')
+        self.transform_fn = transform_fn
+
+    def __call__(self, video, target=None):
+        """
+        A data transformation wrapper for video
+        :param video: a list of images
+        :param target: a list of BoxList (per image)
+        """
+        if not isinstance(video, (list, tuple)):
+            return self.transform_fn(video, target)
+
+        new_video = []
+        new_target = []
+        for (image, image_target) in zip(video, target):
+            (image, image_target) = self.transform_fn(image, image_target)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
+
+
+class SiamVideoResize(ImageResize):
+    def __init__(self, min_size, max_size, size_divisibility):
+        super(SiamVideoResize, self).__init__(min_size, max_size, size_divisibility)
+
+    def __call__(self, video, target=None):
+
+        if not isinstance(video, (list, tuple)):
+            return super(SiamVideoResize, self).__call__(video, target)
+
+        assert len(video) >= 1
+        new_size = self.get_size(video[0].size)
+
+        new_video = []
+        new_target = []
+        for (image, image_target) in zip(video, target):
+            (image, image_target) = self._resize(image, new_size, image_target)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
+
+    def _resize(self, image, size, target=None):
+        image = F.resize(image, size)
+        target = target.resize(image.size)
+        return image, target
+
+
+class SiamVideoRandomHorizontalFlip(object):
+    def __init__(self, prob=0.5):
+        self.prob = prob
+
+    def __call__(self, video, target=None):
+
+        if not isinstance(video, (list, tuple)):
+            return video, target
+
+        new_video = []
+        new_target = []
+        # All frames should have the same flipping operation
+        if random.random() < self.prob:
+            for (image, image_target) in zip(video, target):
+                new_video.append(F.hflip(image))
+                new_target.append(image_target.transpose(0))
+        else:
+            new_video = video
+            new_target = target
+        return new_video, new_target
+
+
+class SiamVideoColorJitter(ImageColorJitter):
+    def __init__(self,
+                 brightness=None,
+                 contrast=None,
+                 saturation=None,
+                 hue=None):
+        super(SiamVideoColorJitter, self).__init__(brightness, contrast, saturation, hue)
+
+    def __call__(self, video, target=None):
+        # Color jitter only applies for Siamese Training
+        if not isinstance(video, (list, tuple)):
+            return video, target
+
+        idx = random.choice((0, 1))
+        # only the frame at a randomly chosen index is color-jittered
+        transform = self.get_params(self.brightness, self.contrast,
+                                    self.saturation, self.hue)
+        new_video = []
+        new_target = []
+        for i, (image, image_target) in enumerate(zip(video, target)):
+            if i == idx:
+                image = transform(image)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
+
+
+class SiamVideoMotionAugment(object):
+    def __init__(self, motion_limit=None, amodal=False):
+        # cap the motion augmentation at 10% of the image size
+        if motion_limit is None:
+            motion_limit = 0
+        self.motion_limit = min(0.1, motion_limit)
+        self.motion_augment = ImageCropResize(self.motion_limit, amodal)
+
+    def __call__(self, video, target=None):
+
+        # Motion augmentation only applies for Siamese Training
+        if not isinstance(video, (list, tuple)) or self.motion_limit == 0:
+            return video, target
+
+        new_video = []
+        new_target = []
+        # Only one frame goes through the motion augmentation;
+        # the other is left unchanged
+        idx = random.choice((0, 1))
+        for i, (image, image_target) in enumerate(zip(video, target)):
+            if i == idx:
+                (image, image_target) = self.motion_augment(image, image_target)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
+
+
+class SiamVideoMotionBlurAugment(object):
+    def __init__(self, motion_blur_prob=None):
+        self.motion_blur_prob = motion_blur_prob
+        if motion_blur_prob is None:
+            self.motion_blur_prob = 0.0
+        self.motion_blur_func = ImageMotionBlur()
+
+    def __call__(self, video, target):
+        # Blur augmentation only applies for Siamese Training
+        if not isinstance(video, (list, tuple)) or self.motion_blur_prob == 0.0:
+            return video, target
+
+        new_video = []
+        new_target = []
+        idx = random.choice((0, 1))
+        for i, (image, image_target) in enumerate(zip(video, target)):
+            if i == idx:
+                random_prob = random.uniform(0, 1)
+                if random_prob < self.motion_blur_prob:
+                    image = self.motion_blur_func(image)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
+
+
+class SiamVideoCompressionAugment(object):
+    def __init__(self, max_compression=None):
+        self.max_compression = max_compression
+        if max_compression is None:
+            self.max_compression = 0.0
+        self.compression_func = ImageCompression(self.max_compression)
+
+    def __call__(self, video, target):
+        # Compression augmentation only applies for Siamese Training
+        if not isinstance(video, (list, tuple)) or self.max_compression == 0.0:
+            return video, target
+
+        idx = random.choice((0, 1))
+        new_video = []
+        new_target = []
+        for i, (image, image_target) in enumerate(zip(video, target)):
+            if i == idx:
+                image = self.compression_func(image)
+            new_video.append(image)
+            new_target.append(image_target)
+
+        return new_video, new_target
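+
+
+if __name__ == "__main__":
+    # Hedged sketch: every Siam* transform consumes a (clip, targets) pair and
+    # applies the same random decision to both frames. Assumes PIL and
+    # maskrcnn_benchmark are installed (both are already dependencies above).
+    from PIL import Image
+    from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+    frame = Image.new('RGB', (320, 240))
+    boxes = BoxList(torch.tensor([[10., 10., 60., 120.]]), frame.size, mode="xyxy")
+
+    flip = SiamVideoRandomHorizontalFlip(prob=1.0)
+    clip, targets = flip([frame, frame], [boxes, boxes])
+    # with prob=1.0 both frames are flipped, so both boxes land on the right side
+    print(targets[0].bbox, targets[1].bbox)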
\ No newline at end of file
diff --git a/siam-mot/siammot/data/adapters/handler/data_filtering.py b/siam-mot/siammot/data/adapters/handler/data_filtering.py
new file mode 100644
index 0000000..9c51b8d
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/handler/data_filtering.py
@@ -0,0 +1,140 @@
+import numpy as np
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import AnnoEntity
+
+from siammot.utils.entity_utils import bbs_iou
+
+
+def build_data_filter_fn(dataset_key: str, *args, **kwargs):
+    """
+    Get dataset specific filter function list, if there is any
+    """
+    filter_fn = None
+    if dataset_key == 'CRP':
+        filter_fn = CRPFilter(*args, **kwargs)
+    elif dataset_key.startswith('MOT'):
+        filter_fn = MOTFilter(*args, **kwargs)
+    elif dataset_key == 'AOT':
+        filter_fn = AOTFilter(*args, **kwargs)
+    return filter_fn
+
+
+class BaseFilter:
+    def __init__(self):
+        pass
+
+    # the default filter keeps every entity, i.e. it filters out nothing
+    def _filter(self, entity: AnnoEntity, ignored_gt_entities=None):
+        return False
+
+    def filter(self, entity: AnnoEntity, ignored_gt_entities=None):
+        return self._filter(entity, ignored_gt_entities)
+
+    def __call__(self, entities: [AnnoEntity], ignored_entities=None, meta_data=None):
+        """
+            Check each entity whether it is valid or should be filtered (ignored).
+            :param entities: A list of entities (for a single frame) to be evaluated
+            :param ignored_entities: A list of ignored entities or a binary mask indicating ignored regions
+            :param meta_data: The meta data for the frame (or video)
+            :return: A list of valid entities and a list of filtered (ignored) entities
+            """
+        valid_entities = []
+        filtered_entities = []
+
+        for entity in entities:
+            if self._filter(entity, ignored_entities):
+                filtered_entities.append(entity)
+            else:
+                valid_entities.append(entity)
+
+        return valid_entities, filtered_entities
+
+
+class CRPFilter(BaseFilter):
+    """
+        A class for filtering JTA dataset entities during evaluation
+        A gt entity will be filtered (ignored) if its id is -1 (negative)
+        A predicted entity will be filtered (ignored) if it is matched to a ignored ground truth entity
+        """
+    def __init__(self, iou_thresh=0.2, is_train=False):
+        """
+        :param iou_thresh: a predicted entity which overlaps with any ignored gt entity with at least
+         iou_thresh would be filtered
+        """
+        self.iou_thresh = iou_thresh
+
+    def _filter(self, entity: AnnoEntity, ignored_gt_entities=None):
+        if ignored_gt_entities is None:
+            if entity.id < 0:
+                return True
+        else:
+            for entity_ in ignored_gt_entities:
+                if bbs_iou(entity, entity_) >= self.iou_thresh:
+                    return True
+        return False
+
+
+class MOTFilter(BaseFilter):
+    """
+    A class for filtering MOT dataset entities
+    A gt entity will be filtered (ignored) if its visibility ratio is very low
+    A predicted entity will be filtered (ignored) if it is matched to an ignored ground truth entity
+    """
+    def __init__(self, visibility_thresh=0.1, iou_thresh=0.5, is_train=False):
+        self.visibility_thresh = visibility_thresh
+        self.iou_thresh = iou_thresh
+        self.is_train = is_train
+
+    def _filter(self, entity: AnnoEntity, ignored_gt_entities=None):
+        if ignored_gt_entities is None:
+            if self.is_train:
+                # any entity whose visibility is below the pre-defined
+                # threshold should be filtered out
+                # meanwhile, any entity whose class does not have label
+                # needs to be filtered
+                if entity.blob['visibility'] < self.visibility_thresh or \
+                        not any(k in ('person', '2', '7') for k in entity.labels):
+                    return True
+            else:
+                if 'person' not in entity.labels or int(entity.id) < 0:
+                    return True
+        else:
+            for entity_ in ignored_gt_entities:
+                if bbs_iou(entity, entity_) >= self.iou_thresh:
+                    return True
+        return False
+
+
+class AOTFilter(BaseFilter):
+    """
+    A class for filtering AOT entities
+    A gt entity will be filtered if it falls into one of the following criteria
+      1. tracking id is not Helicopter1 or Airplane1
+      2. range distance is larger than 1200
+    """
+
+    def __init__(self, range_distance_thresh=1200, iou_thresh=0.2, is_train=False):
+        self.range_distance_thresh = range_distance_thresh
+        self.iou_thresh = iou_thresh
+        self.is_train = is_train
+
+    def _filter(self, entity: AnnoEntity, ignored_gt_entities=None):
+        if ignored_gt_entities is None:
+            range_distance_m = np.inf
+            if 'range_distance_m' in entity.blob:
+                range_distance_m = entity.blob['range_distance_m']
+
+            labels = []
+            if entity.labels is not None:
+                labels = entity.labels
+
+            if ('intruder' not in labels) or \
+                    (range_distance_m >= self.range_distance_thresh):
+                return True
+        else:
+            for entity_ in ignored_gt_entities:
+                if entity_.bbox is not None:
+                    if bbs_iou(entity, entity_) >= self.iou_thresh:
+                        return True
+        return False
+
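+
+if __name__ == "__main__":
+    # Hedged example of the filter factory (uses only AnnoEntity, imported above):
+    # during MOT training, entities with a very low visibility ratio are dropped.
+    filter_fn = build_data_filter_fn('MOT17', is_train=True)
+
+    visible = AnnoEntity(time=0, id=1)
+    visible.labels = {'person': 1}
+    visible.blob = {'visibility': 0.9}
+
+    occluded = AnnoEntity(time=0, id=2)
+    occluded.labels = {'person': 1}
+    occluded.blob = {'visibility': 0.05}
+
+    valid, ignored = filter_fn([visible, occluded])
+    print(len(valid), len(ignored))  # expected: 1 valid, 1 ignored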
diff --git a/siam-mot/siammot/data/adapters/utils/data_utils.py b/siam-mot/siammot/data/adapters/utils/data_utils.py
new file mode 100644
index 0000000..2d2ce35
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/utils/data_utils.py
@@ -0,0 +1,62 @@
+import os
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset
+from pycocotools.coco import COCO
+
+from .dataset_info import dataset_maps
+
+
+def load_motion_anno(dataset_folder,
+                     anno_file,
+                     split_file,
+                     set=None,
+                     ):
+    """
+    Load GluonCVMotionDataset format annotations for downstream training / testing
+    """
+
+    dataset = GluonCVMotionDataset(anno_file,
+                                   root_path=dataset_folder,
+                                   split_file=split_file
+                                   )
+
+    if set == 'train':
+        dataset = list(dataset.train_samples)
+    elif set == 'val':
+        dataset = list(dataset.val_samples)
+    elif set == 'test':
+        dataset = list(dataset.test_samples)
+
+    return dataset
+
+
+def load_coco_anno(dataset_folder,
+                   anno_file):
+
+    dataset_anno_path = os.path.join(dataset_folder, anno_file)
+    dataset = COCO(dataset_anno_path)
+    return dataset
+
+
+def load_dataset_anno(cfg, dataset_key, set=None):
+    dataset_folder, anno_file, split_file, modality = dataset_maps[dataset_key]
+
+    dataset_info = dict()
+    dataset_info['modality'] = modality
+
+    dataset_folder = os.path.join(cfg.DATASETS.ROOT_DIR, dataset_folder)
+    if modality == 'video':
+        dataset = load_motion_anno(dataset_folder,
+                                   anno_file,
+                                   split_file,
+                                   set)
+    elif modality == 'image':
+        dataset = load_coco_anno(dataset_folder,
+                                 anno_file)
+        image_folder = os.path.join(dataset_folder, split_file)
+        dataset_info['image_folder'] = image_folder
+    else:
+        raise ValueError("dataset has to be video or image.")
+
+    return dataset, dataset_info
+
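+
+if __name__ == "__main__":
+    # Hedged usage sketch: resolve a dataset key into its annotations. Assumes
+    # the default siam-mot config (siammot.configs.defaults) and that the MOT17
+    # annotations have already been ingested under cfg.DATASETS.ROOT_DIR.
+    from siammot.configs.defaults import cfg
+
+    samples, info = load_dataset_anno(cfg, 'MOT17', set='train')
+    print(info['modality'], len(samples))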
diff --git a/siam-mot/siammot/data/adapters/utils/dataset_info.py b/siam-mot/siammot/data/adapters/utils/dataset_info.py
new file mode 100644
index 0000000..36527d8
--- /dev/null
+++ b/siam-mot/siammot/data/adapters/utils/dataset_info.py
@@ -0,0 +1,49 @@
+dataset_maps = dict()
+"""
+each item in the dataset map is a list with the following fields:
+(
+dataset_folder,
+annotation file name (video dataset) / path of annotation file (image dataset),
+split file name (video dataset) / path of image folder (image dataset),
+modality
+)
+"""
+dataset_maps['TAO'] = ['TAO',
+                       'anno_person.json',
+                       'splits_person.json',
+                       'video']
+
+dataset_maps['CRP'] = ['caltech_roadside_pedestrians',
+                       'anno.json',
+                       'splits.json',
+                       'video']
+
+dataset_maps['MOT17_DPM'] = ['MOT17',
+                             'anno.json',
+                             'splits_DPM.json',
+                             'video']
+
+dataset_maps['MOT17'] = ['MOT17',
+                         'anno.json',
+                         'splits.json',
+                         'video']
+
+dataset_maps['AOT'] = ['airbone_object_tracking',
+                       'anno.json',
+                       'splits.json',
+                       'video']
+
+dataset_maps['COCO17_train'] = ['mscoco',
+                                'annotations/MSCOCO2017_train_person.json',
+                                'images/train2017',   # all raw images would be in dataset_root/mscoco/images/train2017
+                                'image']
+
+dataset_maps['crowdhuman_train_fbox'] = ['CrowdHuman',
+                                         'annotations/annotation_train_fbox.json',
+                                         'Images',  # all raw images would be in dataset_root/CrowdHuman/Images
+                                         'image']
+
+dataset_maps['crowdhuman_train_vbox'] = ['CrowdHuman',
+                                         'annotations/annotation_train_vbox.json',
+                                         'Images',
+                                         'image']
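+
+
+if __name__ == "__main__":
+    # Hedged example: each entry unpacks into the four fields documented in the
+    # module docstring above and is consumed by adapters.utils.data_utils.
+    folder, anno_file, split_file, modality = dataset_maps['MOT17']
+    print(folder, anno_file, split_file, modality)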
\ No newline at end of file
diff --git a/siam-mot/siammot/data/build_inference_data_loader.py b/siam-mot/siammot/data/build_inference_data_loader.py
new file mode 100644
index 0000000..970fca4
--- /dev/null
+++ b/siam-mot/siammot/data/build_inference_data_loader.py
@@ -0,0 +1,56 @@
+import torch
+import torch.utils.data as data
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import DataSample
+from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+
+class InferenceVideoData(data.Dataset):
+    """
+    Split the video into small chunks (in a non-overlapping fashion) for inference
+    """
+
+    def __init__(self, video: DataSample, clip_len=1, transforms=None):
+        """
+        Construct a data loader for inference
+        :param video: a video stream in DataSample format
+        :param clip_len: the length of video clips
+        :param transforms: transform function for video pre-processing
+        """
+        self.video = video
+        self.video_reader = video.get_data_reader()
+        self.clip_len = clip_len
+        self.transforms = transforms
+        self.clip_idxs = list(range(0, len(self.video), self.clip_len))
+
+    def __getitem__(self, id):
+        video_clip = []
+        # this is needed for transformation
+        dummy_boxes = []
+        timestamps = []
+        start_idx = self.clip_idxs[id]
+        end_idx = min(len(self.video), start_idx + self.clip_len)
+        for frame_idx in range(start_idx, end_idx):
+            (im, timestamp, _) = self.video_reader[frame_idx]
+            dummy_bbox = torch.tensor([[0, 0, 1, 1]])
+            dummy_boxlist = BoxList(dummy_bbox, im.size, mode='xywh')
+
+            video_clip.append(im)
+            timestamps.append(torch.tensor(timestamp))
+            dummy_boxes.append(dummy_boxlist)
+
+        if self.transforms is not None:
+            video_clip, _ = self.transforms(video_clip, dummy_boxes)
+
+        return torch.stack(video_clip), start_idx, torch.stack(timestamps)
+
+    def __len__(self):
+        return len(self.clip_idxs)
+
+
+def build_video_loader(cfg, video: DataSample, transforms):
+    clip_len = cfg.INFERENCE.CLIP_LEN
+    videodata = InferenceVideoData(video, clip_len=clip_len, transforms=transforms)
+    videoloader = data.DataLoader(videodata, num_workers=4, batch_size=1, shuffle=False)
+
+    return videoloader
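+
+
+if __name__ == "__main__":
+    # Hedged inference-time sketch: wrap one ingested video in a clip loader and
+    # iterate it in non-overlapping clips. Assumes the default siam-mot config,
+    # that the MOT17 annotations have been ingested (see adapters/utils), and
+    # that samples come back as (sample_id, DataSample) pairs.
+    from siammot.configs.defaults import cfg
+    from siammot.data.adapters.utils.data_utils import load_dataset_anno
+    from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation
+
+    samples, _ = load_dataset_anno(cfg, 'MOT17', set='test')
+    _, video = samples[0]
+    transforms = build_siam_augmentation(cfg, is_train=False, modality='video')
+
+    video_loader = build_video_loader(cfg, video, transforms)
+    for video_clip, start_idx, timestamps in video_loader:
+        print(video_clip.shape, int(start_idx))
+        break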
diff --git a/siam-mot/siammot/data/build_train_data_loader.py b/siam-mot/siammot/data/build_train_data_loader.py
new file mode 100644
index 0000000..2017413
--- /dev/null
+++ b/siam-mot/siammot/data/build_train_data_loader.py
@@ -0,0 +1,77 @@
+import torch.utils.data
+
+from maskrcnn_benchmark.utils.comm import get_world_size
+from maskrcnn_benchmark.data.build import make_data_sampler, make_batch_data_sampler
+from maskrcnn_benchmark.data.datasets.concat_dataset import ConcatDataset
+
+from .video_dataset import VideoDataset, VideoDatasetBatchCollator
+from .image_dataset import ImageDataset
+from .adapters.utils.data_utils import load_dataset_anno
+from .adapters.augmentation.build_augmentation import build_siam_augmentation
+from .adapters.handler.data_filtering import build_data_filter_fn
+
+
+def build_dataset(cfg):
+    """
+
+    """
+
+    dataset_list = cfg.DATASETS.TRAIN
+    if not isinstance(dataset_list, (list, tuple)):
+        raise RuntimeError(
+            "dataset_list should be a list of strings, got {}".format(dataset_list)
+        )
+
+    datasets = []
+    for dataset_key in dataset_list:
+        dataset_anno, dataset_info = load_dataset_anno(cfg, dataset_key)
+        modality = dataset_info['modality']
+        transforms = build_siam_augmentation(cfg, is_train=True, modality=modality)
+        data_filter_fn = build_data_filter_fn(dataset_key, is_train=True)
+
+        if modality == 'image':
+            assert 'image_folder' in dataset_info
+            _dataset = ImageDataset(dataset_anno,
+                                    dataset_info['image_folder'],
+                                    transforms=transforms,
+                                    frames_per_image=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP,
+                                    amodal=cfg.INPUT.AMODAL)
+        else:
+            _dataset = VideoDataset(dataset_anno,
+                                    sampling_interval=cfg.VIDEO.TEMPORAL_SAMPLING,
+                                    clip_len=cfg.VIDEO.TEMPORAL_WINDOW,
+                                    transforms=transforms,
+                                    filter_fn=data_filter_fn,
+                                    frames_in_clip=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP,
+                                    amodal=cfg.INPUT.AMODAL)
+        datasets.append(_dataset)
+
+    dataset = ConcatDataset(datasets)
+
+    return dataset
+
+
+def build_train_data_loader(cfg, is_distributed=False, start_iter=0):
+
+    num_gpus = get_world_size()
+
+    video_clips_per_batch = cfg.SOLVER.VIDEO_CLIPS_PER_BATCH
+    assert (
+        video_clips_per_batch % num_gpus == 0
+    ), "SOLVER.VIDEO_CLIPS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format(
+        video_clips_per_batch, num_gpus)
+
+    video_clips_per_gpu = video_clips_per_batch // num_gpus
+
+    dataset = build_dataset(cfg)
+    num_iters = cfg.SOLVER.MAX_ITER
+    sampler = make_data_sampler(dataset, True, is_distributed)
+    batch_sampler = make_batch_data_sampler(
+        dataset, sampler, [], video_clips_per_gpu, num_iters, start_iter
+    )
+
+    num_workers = cfg.DATALOADER.NUM_WORKERS
+    collator = VideoDatasetBatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
+    data_loader = torch.utils.data.DataLoader(dataset, num_workers=num_workers,
+                                              batch_sampler=batch_sampler, collate_fn=collator)
+    return data_loader
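+
+
+if __name__ == "__main__":
+    # Hedged sketch: build the training loader from the default config. This
+    # assumes cfg.DATASETS.TRAIN points at datasets that have been ingested
+    # locally; it mirrors the self-test at the bottom of image_dataset.py.
+    from siammot.configs.defaults import cfg
+
+    train_loader = build_train_data_loader(cfg, is_distributed=False)
+    for images, targets, clip_ids in train_loader:
+        print(clip_ids)
+        break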
diff --git a/siam-mot/siammot/data/image_dataset.py b/siam-mot/siammot/data/image_dataset.py
new file mode 100644
index 0000000..806e8e6
--- /dev/null
+++ b/siam-mot/siammot/data/image_dataset.py
@@ -0,0 +1,232 @@
+import torch
+import os
+from tqdm import tqdm
+from PIL import Image
+
+import torch.utils.data as data
+from pycocotools.coco import COCO
+from gluoncv.utils.bbox import bbox_xywh_to_xyxy, bbox_clip_xyxy
+
+from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+
+class ImageDataset(data.Dataset):
+    def __init__(self,
+                 dataset: COCO,
+                 image_dir,
+                 transforms=None,
+                 frames_per_image=1,
+                 amodal=False,
+                 skip_empty=True,
+                 min_object_area=0,
+                 use_crowd=False,
+                 include_bg=False,
+                 ):
+        """
+        :param dataset: the ingested dataset with COCO-format
+        :param transforms: image transformation
+        :param frames_per_image: how many image copies are generated from a single image
+        :param amodal: whether to use amodal ground truth (no image boundary clipping)
+        :param include_bg: whether to include the full background images during training
+        """
+
+        self.dataset = dataset
+        self.image_dir = image_dir
+        self.transforms = transforms
+        self.frames_per_image = frames_per_image
+
+        self._skip_empty = skip_empty
+        self._min_object_area = min_object_area
+        self._use_crowd = use_crowd
+        self._amodal = amodal
+        self._include_bg = include_bg
+        self._det_classes = [c['name'] for c in self.dataset.loadCats(self.dataset.getCatIds())]
+
+        # This is the mapping table from COCO category ids to contiguous label ids
+        self.json_category_id_to_contiguous_id = {
+            v: i+1 for i, v in enumerate(self.dataset.getCatIds())
+        }
+
+        self._labels, self._im_aspect_ratios, self._items, self._ids \
+            = self._dataset_preprocess()
+
+        self.id_to_img_map = {k: v for k, v in enumerate(self._ids)}
+
+    def __getitem__(self, index):
+        img_name = self._items[index]
+        img_path = os.path.join(self.image_dir, img_name)
+
+        img = Image.open(img_path).convert('RGB')
+        target = self._get_target(img, index)
+
+        # for tracking purposes, two frames are needed
+        # the pairs would go into random augmentation to generate fake motion
+        video_clip = [img for _ in range(self.frames_per_image)]
+        video_target = [target for _ in range(self.frames_per_image)]
+
+        if self.transforms is not None:
+            video_clip, video_target = self.transforms(video_clip, video_target)
+
+        return video_clip, video_target, img_name
+
+    def _get_target(self, img, index):
+
+        # a list of label (x1, y1, x2, y2, class_id, instance_id)
+        labels = self._labels[index]
+        if len(labels) == 0:
+            assert self._include_bg is True, "The image does not have ground truth"
+            bbox = torch.as_tensor(labels).reshape(-1, 4)
+            class_ids = torch.as_tensor(labels)
+            instance_ids = torch.as_tensor(labels)
+            empty_boxlist = BoxList(bbox, img.size, mode="xyxy")
+            empty_boxlist.add_field("labels", class_ids)
+            empty_boxlist.add_field("ids", instance_ids)
+            return empty_boxlist
+
+        labels = torch.as_tensor(labels).reshape(-1, 6)
+        boxes = labels[:, :4]
+        target = BoxList(boxes, img.size, mode="xyxy")
+
+        class_ids = labels[:, 4].clone().to(torch.int64)
+        target.add_field("labels", class_ids)
+
+        instance_ids = labels[:, -1].clone().to(torch.int64)
+        target.add_field("ids", instance_ids)
+
+        if not self._amodal:
+            target = target.clip_to_image(remove_empty=True)
+
+        return target
+
+    def _dataset_preprocess(self):
+        items = []
+        labels = []
+        ids = []
+        im_aspect_ratios = []
+        image_ids = sorted(self.dataset.getImgIds())
+        instance_id = 0
+        rm_redundant = 0
+        all_amodal = 0
+
+        for entry in tqdm(self.dataset.loadImgs(image_ids)):
+            label, num_instances, num_redundant, num_amodal\
+                = self._check_load_bbox(entry, instance_id)
+            if not label and not self._include_bg:
+                continue
+            instance_id += num_instances
+            rm_redundant += num_redundant
+            all_amodal += num_amodal
+            labels.append(label)
+            ids.append(entry['id'])
+            items.append(entry['file_name'])
+            im_aspect_ratios.append(float(entry['width']) / entry['height'])
+
+        print('{} / {} valid images...'.format(len(labels), len(image_ids)))
+        print('{} instances...'.format(instance_id))
+        print('{} redundant instances are removed...'.format(rm_redundant))
+        print('{} amodal instances...'.format(all_amodal))
+        return labels, im_aspect_ratios, items, ids
+
+    def _check_load_bbox(self, entry, instance_id):
+        """
+        Check and load ground-truth labels
+        """
+        entry_id = entry['id']
+        entry_id = [entry_id] if not isinstance(entry_id, (list, tuple)) else entry_id
+        ann_ids = self.dataset.getAnnIds(imgIds=entry_id, iscrowd=None)
+        objs = self.dataset.loadAnns(ann_ids)
+
+        # check valid bboxes
+        valid_objs = []
+        width = entry['width']
+        height = entry['height']
+        _instance_count = 0
+        _redudant_count = 0
+        _amodal_count = 0
+        unique_bbs = set()
+        for obj in objs:
+            if obj.get('ignore', 0) == 1:
+                continue
+            if not self._use_crowd and obj.get('iscrowd', 0):
+                continue
+            if self._amodal:
+                xmin, ymin, xmax, ymax = bbox_xywh_to_xyxy(obj['bbox'])
+                if xmin < 0 or ymin < 0 or xmax > width or ymax > height:
+                    _amodal_count += 1
+            else:
+                xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height)
+
+            if (xmin, ymin, xmax, ymax) in unique_bbs:
+                _redudant_count += 1
+                continue
+
+            box_w = (xmax - xmin)
+            box_h = (ymax - ymin)
+            area = box_w * box_h
+            if area <= self._min_object_area:
+                continue
+
+            # require non-zero box area
+            if xmax > xmin and ymax > ymin:
+                unique_bbs.add((xmin, ymin, xmax, ymax))
+                contiguous_cid = self.json_category_id_to_contiguous_id[obj['category_id']]
+                valid_objs.append([xmin, ymin, xmax, ymax, contiguous_cid,
+                                   instance_id+_instance_count])
+                _instance_count += 1
+        if not valid_objs:
+            if not self._skip_empty:
+                # dummy invalid labels if no valid objects are found
+                valid_objs.append([-1, -1, -1, -1, -1, -1])
+        return valid_objs, _instance_count, _redudant_count, _amodal_count
+
+    def __len__(self):
+        return len(self._items)
+
+    def get_img_info(self, index):
+        img_id = self.id_to_img_map[index]
+        img_data = self.dataset.imgs[img_id]
+        return img_data
+
+    @property
+    def classes(self):
+        return self._det_classes
+
+    def get_im_aspect_ratio(self):
+        return self._im_aspect_ratios
+
+
+if __name__ == "__main__":
+
+    from siammot.configs.defaults import cfg
+    from siammot.data.video_dataset import VideoDatasetBatchCollator
+    from siammot.data.adapters.utils.data_utils import load_dataset_anno
+    from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation
+
+    torch.manual_seed(0)
+
+    dataset_anno, dataset_info = load_dataset_anno(cfg, 'COCO17_train')
+    collator = VideoDatasetBatchCollator()
+    transforms = build_siam_augmentation(cfg, modality=dataset_info['modality'])
+
+    dataset = ImageDataset(dataset_anno,
+                           dataset_info['image_folder'],
+                           frames_per_image=2,
+                           transforms=transforms,
+                           amodal=True)
+
+    batch_size = 16
+    sampler = torch.utils.data.sampler.RandomSampler(dataset)
+    batch_sampler = torch.utils.data.sampler.BatchSampler(
+        sampler, batch_size, drop_last=False)
+    dataloader = data.DataLoader(dataset,
+                                 num_workers=4,
+                                 batch_sampler=batch_sampler,
+                                 collate_fn=collator
+                                 )
+    import time
+    tic = time.time()
+    for iteration, (image, target, image_ids) in enumerate(dataloader):
+        data_time = time.time() - tic
+        print("Data loading time: {}".format(data_time))
+        tic = time.time()
+        print(image_ids)
\ No newline at end of file
diff --git a/siam-mot/siammot/data/ingestion/ingest_mot.py b/siam-mot/siammot/data/ingestion/ingest_mot.py
new file mode 100644
index 0000000..cd10128
--- /dev/null
+++ b/siam-mot/siammot/data/ingestion/ingest_mot.py
@@ -0,0 +1,197 @@
+import argparse
+import csv
+import configparser
+import datetime
+import glob
+import os
+
+from PIL import Image
+from pathlib import Path
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, DataSample, AnnoEntity, FieldNames, SplitNames
+from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits
+
+# From paper, see table 5 and 6: https://arxiv.org/pdf/1603.00831.pdf
+MOT_LABEL_MAP = {
+    1: "Pedestrian",
+    2: "Person on vehicle",
+    3: "Car",
+    4: "Bicycle",
+    5: "Motorbike",
+    6: "Non motorized vehicle",
+    7: "Static person",
+    8: "Distractor",
+    9: "Occluder",
+    10: "Occluder on the ground",
+    11: "Occluder full",
+    12: "Reflection",
+}
+
+DET_OPTIONS = {"SDP", "FRCNN", "DPM"}
+
+
+def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False):
+    if sample is None:
+        id_ = Path(csv_path).stem
+        sample = DataSample(id_)
+    else:
+        sample = sample.get_copy_without_entities()
+    with open(csv_path, newline='') as f:
+        reader = csv.reader(f, delimiter=',')
+
+        def coord(x):
+            return round(float(x))
+
+        for row in reader:
+            frame_num = int(row[0])
+            obj_id = row[1]
+            x = coord(row[2])
+            y = coord(row[3])
+            w = coord(row[4])
+            h = coord(row[5])
+            conf = float(row[6])
+            # If not mot17 the last 3 are 3D coords which are usually -1
+            # (see pg. 9 https://arxiv.org/pdf/1504.01942.pdf)
+            if has_gt and mot17:
+                label = int(row[7])
+                visibility = float(row[8])
+            else:
+                label = 1
+                visibility = 1
+
+            label_text = MOT_LABEL_MAP[label]
+
+            # NOTE: all classes other than Pedestrian have confidence 0; such entities are still
+            # ingested but are ignored at evaluation time,
+            # i.e. (label != 1 and conf) is never true
+            assert not (label != 1 and conf)
+            has_person_label = label_text in ("Pedestrian",)
+
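+            # Convert the 1-based frame number to a millisecond timestamp
+            # (e.g. frame 31 at 30 fps -> 1000 ms).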
+            time_ms = int((frame_num - 1) / fps * 1000)
+            entity = AnnoEntity(time=time_ms, id=obj_id)
+            entity.bbox = [x, y, w, h]
+            blob = {
+                "frame_csv": frame_num,
+                "frame_idx": frame_num - 1,
+                "visibility": visibility
+            }
+            entity.labels = {}
+            # entity.labels["person"] = 1
+            if has_person_label:
+                entity.labels["person"] = 1
+            else:
+                entity.labels[str(label)] = 1
+            entity.labels["vis"] = visibility
+
+            entity.confidence = conf
+            entity.blob = blob
+
+            sample.add_entity(entity)
+    return sample
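+
+# Example usage (illustrative path): ingest a single sequence's ground truth into a DataSample:
+#   sample = sample_from_mot_csv("MOT17-02-FRCNN/gt/gt.txt", fps=30, mot17=True, has_gt=True)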
+
+
+def main(args, description="Initial ingestion", det_options=None, mot17=True):
+    if mot17:
+        if det_options is not None and not all(x in DET_OPTIONS for x in det_options):
+            raise ValueError("Det options were {} but must be only: {}".format(det_options, DET_OPTIONS))
+        if det_options is None:
+            det_options = DET_OPTIONS
+    else:
+        print("Ingesting MOT15, ignoring det options {}".format(det_options))
+        det_options = [""]
+
+    dataset_path = args.dataset_path
+    out_filename = args.anno_name
+
+    out_dataset = GluonCVMotionDataset(out_filename, dataset_path, load_anno=False)
+    metadata = {
+        FieldNames.DESCRIPTION: description,
+        FieldNames.DATE_MODIFIED: str(datetime.datetime.now()),
+    }
+    out_dataset.metadata = metadata
+
+    splits = {
+        "train": os.path.join(out_dataset.data_root_path, "train"),
+        "test": os.path.join(out_dataset.data_root_path, "test"), # No gt for MOT test
+    }
+
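+    # For MOT17 each sequence appears once per public detector (DPM / FRCNN / SDP),
+    # so the same video is ingested under three different ids.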
+    for det_option in det_options:
+        for split_name, split_path in splits.items():
+            subdirs = glob.glob(os.path.join(split_path, "*" + det_option))
+            for i, subdir in enumerate(subdirs):
+                vid_id = os.path.basename(subdir)
+                vid_path = os.path.join(split_path, subdir)
+
+                sample = DataSample(vid_id)
+
+                if mot17:
+                    info_path = os.path.join(vid_path, "seqinfo.ini")
+                    config = configparser.ConfigParser()
+                    config.read(info_path)
+                    seq_conf = config["Sequence"]
+                    fps = float(seq_conf['frameRate'])
+                    num_frames = int(seq_conf['seqLength'])
+                    width = int(seq_conf['imWidth'])
+                    height = int(seq_conf['imHeight'])
+                else:
+                    # Assume 30 fps
+                    fps = 30
+                    im_paths = glob.glob(os.path.join(vid_path, "img1", "*.jpg"))
+                    num_frames = len(im_paths)
+                    im_example = Image.open(im_paths[0])
+                    width = im_example.width
+                    height = im_example.height
+
+                rel_base_dir = vid_path.replace(out_dataset.data_root_path, "").lstrip(os.path.sep)
+                rel_base_dir = os.path.join(rel_base_dir, "img1")
+                metadata = {
+                    FieldNames.DATA_PATH: rel_base_dir,
+                    FieldNames.FPS: fps,
+                    FieldNames.NUM_FRAMES: num_frames,
+                    FieldNames.RESOLUTION: {"width": width, "height": height},
+                }
+                sample.metadata = metadata
+
+                gt_path = os.path.join(vid_path, "gt/gt.txt")
+                det_path = os.path.join(vid_path, "det/det.txt")
+                has_gt = os.path.exists(gt_path)
+                anno_path = gt_path if has_gt else det_path
+
+                sample = sample_from_mot_csv(anno_path, fps, sample, mot17, has_gt)
+
+                out_dataset.add_sample(sample)
+
+                print("Done {} sample {}/{}, {}".format(split_name, i+1, len(subdirs), vid_id))
+
+    out_dataset.dump()
+
+    return out_dataset
+
+
+def write_data_split(args, dataset):
+    if dataset is None:
+        dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path)
+
+    def split_func(sample):
+        data_path = sample.data_relative_path
+        if data_path.startswith("train"):
+            return SplitNames.TRAIN
+        elif data_path.startswith("test"):
+            return SplitNames.TEST
+
+        raise Exception("Unexpected data path: {}".format(data_path))
+
+    process_dataset_splits(dataset, split_func, save=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Ingest MOT dataset')
+    parser.add_argument('--dataset_path', default="",
+                        help="Path of the dataset folder")
+    parser.add_argument('--anno_name', default="anno.json",
+                        help="File name (with .json extension) of the ingested annotation file")
+    args = parser.parse_args()
+
+    mot17 = "MOT17" in args.dataset_path
+    dataset = main(args, mot17=mot17)
+    write_data_split(args, dataset)
diff --git a/siam-mot/siammot/data/ingestion/ingest_prim_air.py b/siam-mot/siammot/data/ingestion/ingest_prim_air.py
new file mode 100644
index 0000000..b973d1d
--- /dev/null
+++ b/siam-mot/siammot/data/ingestion/ingest_prim_air.py
@@ -0,0 +1,127 @@
+import argparse
+import copy
+import datetime
+import fire
+import string
+import tqdm
+import os
+from pathlib import Path
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, FieldNames, SplitNames
+from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits
+from gluoncv.torch.data.gluoncv_motion_dataset.utils.serialization_utils import save_json
+
+
+def ingest_dataset(args, renumber_ids=True):
+    """
+
+    :param args: Input arguments
+    :param renumber_ids: rename track identities to integers
+    """
+    dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path, load_anno=False)
+    dataset.metadata = {
+        FieldNames.DESCRIPTION: "Initial ingestion",
+        FieldNames.DATE_MODIFIED: str(datetime.datetime.now()),
+    }
+    # raw_anno_paths = sorted(Path(dataset.data_root_path).glob("groundtruth.json"))
+    raw_anno_paths = sorted(Path('/home/ubuntu/airborne-detection-starter-kit/data/').glob("groundtruth.json"))
+
+    for raw_anno_path in tqdm.tqdm(raw_anno_paths):
+        # Setting the dataset and samples to None here looks pointless, but it allows the memory
+        # to be freed; otherwise, on subsequent iterations, loading a new dataset while keeping the
+        # previous one in memory can exhaust RAM (happened on a c5.xlarge with 8GB RAM)
+        raw_dataset = None
+        samples = None
+        # raw_sample and sample have references back to the dataset so have to unset these too
+        raw_sample = sample = None
+        raw_dataset = GluonCVMotionDataset(raw_anno_path)
+        raw_dataset.__version__ = 1
+        set_dir = raw_anno_path.parent.parent
+        images_root_path = Path(dataset.data_root_path) # set_dir / "Images"
+
+        samples = sorted(raw_dataset.samples)
+        with open('/home/ubuntu/siam-mot/data/all_flights_val.txt', 'r') as f:
+            all_flights = f.readlines()
+        all_flights = [flight.rstrip() for flight in all_flights]
+
+        for raw_id, raw_sample in tqdm.tqdm(samples):
+            if raw_id not in all_flights[200:]:
+                continue
+            data_path = images_root_path / raw_id
+            data_rel_path = str(data_path.relative_to(dataset.data_root_path))
+            new_id = data_rel_path
+            first_img = sorted(data_path.glob("*.png"))[0]
+            first_timestamp = int(first_img.name.split(raw_id)[0])
+            sample = raw_sample.get_copy_without_entities(new_id=new_id)
+            sample.metadata["orig_path"] = raw_sample.data_relative_path
+            sample.data_relative_path = data_rel_path
+            unique_ids = {}
+
+            first_frame = None
+            for raw_entity in raw_sample.entities:
+                entity = copy.deepcopy(raw_entity)
+                orig_frame = entity.blob.pop("frame")
+                orig_time = entity.time
+                if first_frame is None:
+                    assert raw_entity.time == first_timestamp
+                    first_frame = orig_frame
+                rel_frame = orig_frame - first_frame
+                # rel_ts = raw_entity.time - first_timestamp
+                # assert rel_ts >= 0
+                # rel_ts_msec = rel_ts / 1e6
+                # ts_msec_round = int(round(rel_ts_msec / sample.period) * sample.period)
+                # print(f"frame: {raw_entity.blob.get('frame')} ts_msec: {rel_ts_msec} ts_round {ts_msec_round}")
+                # print()
+                # assert abs(rel_ts_msec - ts_msec_round) < sample.period / 10
+                # entity.time = ts_msec_round
+
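+                # Re-derive the entity timestamp as milliseconds from the clip start,
+                # based on the frame offset and the sample fps.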
+                entity.time = round(rel_frame / sample.fps * 1000)
+                if entity.id:
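+                    # Raw ids such as "Airplane1" encode the object class; stripping the
+                    # trailing digits recovers it as a label.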
+                    obj_type = entity.id.rstrip(string.digits).lower()
+                    entity.labels[obj_type] = 1
+                    if entity.id.lower() in ("airplane1", "helicopter1"):
+                        entity.labels["intruder"] = 1
+                    entity.blob["orig_id"] = entity.id
+                    if renumber_ids:
+                        entity.id = unique_ids.setdefault(entity.id, len(unique_ids))
+                entity.blob[FieldNames.FRAME_IDX] = rel_frame
+                entity.blob["orig_frame"] = orig_frame
+                entity.blob["orig_time"] = orig_time
+                if entity.labels and "miss_distance_class" in entity.labels:
+                    entity.blob["miss_distance_class"] = entity.labels.pop("miss_distance_class")
+                if "range_distance_m" in entity.blob:
+                    entity.blob["range_distance_m"] = round(entity.blob["range_distance_m"], 1)
+                sample.add_entity(entity)
+
+            # break
+            dataset.add_sample(sample, dump_directly=True)
+
+        dataset.dump()
+
+    return dataset
+
+
+def write_split(dataset):
+    def split_func(sample):
+        # data_path = sample.data_relative_path
+        orig_path = sample.metadata['orig_path']
+        if orig_path.startswith("train"):
+            return SplitNames.TRAIN
+        elif orig_path.startswith("val"):
+            return SplitNames.VAL
+
+        raise Exception("Unexpected orig_path: {}".format(orig_path))
+
+    process_dataset_splits(dataset, split_func, save=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Ingest Prime Air dataset')
+    parser.add_argument('--dataset_path', default="/home/ubuntu/airborne-detection-starter-kit/data/val/",
+                        help="Path of the dataset folder")
+    parser.add_argument('--anno_name', default="anno.json",
+                        help="File name (with .json extension) of the ingested annotation file")
+    args = parser.parse_args()
+
+    dataset = ingest_dataset(args, renumber_ids=True)
+    write_split(dataset)
diff --git a/siam-mot/siammot/data/video_dataset.py b/siam-mot/siammot/data/video_dataset.py
new file mode 100644
index 0000000..a9c3f6a
--- /dev/null
+++ b/siam-mot/siammot/data/video_dataset.py
@@ -0,0 +1,195 @@
+import random
+import torch
+import itertools
+import torch.utils.data as data
+from tqdm import tqdm
+from collections import defaultdict
+from PIL.Image import Image
+
+from maskrcnn_benchmark.structures.image_list import to_image_list
+from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, AnnoEntity
+
+
+class VideoDataset(data.Dataset):
+
+    def __init__(self, dataset: GluonCVMotionDataset, sampling_interval=250, clip_len=1000,
+                 is_train=True, frames_in_clip=2, transforms=None, filter_fn=None,
+                 amodal=False):
+        """
+        :param dataset: the ingested dataset with GluonCVMotionDataset
+        :param sampling_interval: the temporal stride (in ms) of sliding window
+        :param clip_len: the temporal length (in ms) of video clips
+        :param is_train: a boolean flag indicating whether it is training
+        :param frames_in_clip: the number of frames sampled in a video clip (for a training example)
+        :param transforms: frame-level transformations applied before frames are fed to the network
+        :param filter_fn: a callable function to filter entities
+        :param amodal: if True, keep boxes that extend beyond the image boundary instead of clipping them
+        """
+
+        if dataset is None:
+            raise Exception('dataset should not be None. Call GluonCVMotionDataset to construct dataset first.')
+
+        assert is_train is True, "The dataset class only supports training"
+        assert (2 >= frames_in_clip > 0), "frames_in_clip has to be 1 or 2"
+
+        self.data = dict(dataset.train_samples)
+
+        self.clip_len = clip_len
+        self.transforms = transforms
+        self.filter_fn = filter_fn
+        self.frames_in_clip = min(clip_len, frames_in_clip)
+
+        # Process dataset to get all valid video clips
+        self.clips = self.get_video_clips(sampling_interval_ms=sampling_interval)
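+        # self.clips is a list of (sample_id, [frame_idx, ...]) tuples; __getitem__
+        # samples self.frames_in_clip frames from one such clip.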
+        self.amodal = amodal
+
+    def __getitem__(self, item_id):
+
+        video = []
+        target = []
+
+        (sample_id, clip_frame_ids) = self.clips[item_id]
+        video_info = self.data[sample_id]
+        video_reader = video_info.get_data_reader()
+
+        # Randomly sample self.frames_in_clip frames
+        # and keep their relative temporal order
+        rand_idxs = sorted(random.sample(clip_frame_ids, self.frames_in_clip))
+        for frame_idx in rand_idxs:
+            im = video_reader[frame_idx][0]
+            entities = video_info.get_entities_for_frame_num(frame_idx)
+            if self.filter_fn is not None:
+                entities, _ = self.filter_fn(entities, meta_data=video_info.metadata)
+            boxes = self.entity2target(im, entities)
+
+            video.append(im)
+            target.append(boxes)
+
+        # Video clip-level augmentation
+        if self.transforms is not None:
+            video, target = self.transforms(video, target)
+
+        return video, target, sample_id
+
+    def __len__(self):
+        return len(self.clips)
+
+    def get_video_clips(self, sampling_interval_ms=250):
+        """
+        Process the long videos to a small video chunk (with self.clip_len seconds)
+        Video clips are generated in a temporal sliding window fashion
+        """
+        video_clips = []
+        for (sample_id, sample) in tqdm(self.data.items()):
+            frame_idxs_with_anno = sample.get_non_empty_frames(self.filter_fn)
+            if len(frame_idxs_with_anno) == 0:
+                continue
+            # The video clip may not be temporally continuous
+            start_frame = min(frame_idxs_with_anno)
+            end_frame = max(frame_idxs_with_anno)
+            # make sure that the video clip spans at least self.frames_in_clip frames
+            clip_len_in_frames = max(self.frames_in_clip, int(self.clip_len / 1000. * sample.fps))
+            sampling_interval = int(sampling_interval_ms / 1000. * sample.fps)
+            for idx in range(start_frame, end_frame, sampling_interval):
+                clip_frame_ids = []
+                # only include frames with annotation within the video clip
+                for frame_idx in range(idx, idx + clip_len_in_frames):
+                    if frame_idx in frame_idxs_with_anno:
+                        clip_frame_ids.append(frame_idx)
+                # Only include video clips that have at least self.frames_in_clip annotated frames
+                if len(clip_frame_ids) >= self.frames_in_clip:
+                    video_clips.append((sample_id, clip_frame_ids))
+
+        return video_clips
+
+    def entity2target(self, im: Image, entities: [AnnoEntity]):
+        """
+        Wrap up the entity to maskrcnn-benchmark compatible format - BoxList
+        """
+        boxes = [entity.bbox for entity in entities]
+        ids = [int(entity.id) for entity in entities]
+        # we only consider person tracking for now, so all labels are 1;
+        # category 0 is reserved for background during training
+        int_labels = [1 for _ in entities]
+
+        boxes = torch.as_tensor(boxes).reshape(-1, 4)
+        boxes = BoxList(boxes, im.size, mode='xywh').convert('xyxy')
+        if not self.amodal:
+            boxes = boxes.clip_to_image(remove_empty=False)
+        boxes.add_field('labels', torch.as_tensor(int_labels, dtype=torch.int64))
+        boxes.add_field('ids', torch.as_tensor(ids, dtype=torch.int64))
+
+        return boxes
+
+
+class VideoDatasetBatchCollator(object):
+    """
+    From a list of samples from the dataset,
+    returns the batched images and targets.
+    This should be passed to the DataLoader
+    """
+
+    def __init__(self, size_divisible=0):
+        self.size_divisible = size_divisible
+
+    def __call__(self, batch):
+        transposed_batch = list(zip(*batch))
+        image_batch = list(itertools.chain(*transposed_batch[0]))
+        image_batch = to_image_list(image_batch, self.size_divisible)
+
+        # remap instance ids so that they are unique across the whole batch
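+        # e.g. track id 3 in video A and track id 3 in video B map to different global uids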
+        targets = transposed_batch[1]
+        video_ids = transposed_batch[2]
+        uid = 0
+        video_id_map = defaultdict(dict)
+        for targets_per_video, video_id in zip(targets, video_ids):
+            for targets_per_video_frame in targets_per_video:
+                if targets_per_video_frame.has_field('ids'):
+                    _ids = targets_per_video_frame.get_field('ids')
+                    _uids = _ids.clone()
+                    for i in range(len(_ids)):
+                        _id = _ids[i].item()
+                        if _id not in video_id_map[video_id]:
+                            video_id_map[video_id][_id] = uid
+                            uid += 1
+                        _uids[i] = video_id_map[video_id][_id]
+                    targets_per_video_frame.extra_fields['ids'] = _uids
+
+        targets = list(itertools.chain(*targets))
+
+        return image_batch, targets, video_ids
+
+
+if __name__ == "__main__":
+
+    from siammot.data.adapters.utils.data_utils import load_dataset_anno
+
+    torch.manual_seed(0)
+
+    dataset_anno, dataset_info = load_dataset_anno('MOT17')
+    collator = VideoDatasetBatchCollator()
+
+    dataset = VideoDataset(dataset_anno,
+                           frames_in_clip=2,
+                           amodal=True)
+
+    batch_size = 16
+    sampler = torch.utils.data.sampler.RandomSampler(dataset)
+    batch_sampler = torch.utils.data.sampler.BatchSampler(
+        sampler, batch_size, drop_last=False)
+    dataloader = data.DataLoader(dataset,
+                                 num_workers=4,
+                                 batch_sampler=batch_sampler,
+                                 collate_fn=collator
+                                 )
+    import time
+    tic = time.time()
+    for iteration, (image, target, image_ids) in enumerate(dataloader):
+        data_time = time.time() - tic
+        print("Data loading time: {}".format(data_time))
+        tic = time.time()
+        print(image_ids)
-- 
GitLab