From 179451dfc2ed154b2bef67ef4e9c962423e05775 Mon Sep 17 00:00:00 2001 From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr> Date: Thu, 10 Sep 2020 15:15:03 +0200 Subject: [PATCH] Divided into 3 separate scripts: deploy_stacks.py, load_data.py, run_workflow.py --- .gitignore | 114 ++- .../docker_compose_generator.cpython-36.pyc | Bin 2107 -> 0 bytes .../docker_compose_generator.cpython-38.pyc | Bin 2644 -> 0 bytes __pycache__/metadata_generator.cpython-36.pyc | Bin 585 -> 0 bytes __pycache__/metadata_generator.cpython-38.pyc | Bin 908 -> 0 bytes __pycache__/table_parser.cpython-36.pyc | Bin 2676 -> 0 bytes __pycache__/table_parser.cpython-38.pyc | Bin 2694 -> 0 bytes create_input_instance.py | 108 ++- deploy_stacks.py | 760 +++--------------- docker_compose_generator.py | 18 +- ...ple_input.json => json_example_input.json} | 0 examples/yml_example_input.yml | 10 +- ...ata_libraries.py => galaxy_data_libs_SI.py | 0 load_data.py | 253 +++++- run_workflow.py | 462 +++++++++++ table_parser.py | 159 ++-- templates/compose-template.yml | 2 +- templates/stack-organism.yml | 4 +- {ext_scripts => utils}/__init__.py | 0 {ext_scripts => utils}/blastdb.py | 0 .../common-stringSubsitute.py | 72 +- .../phaeoexplorer-change_pep_fasta_header.sh | 32 +- ...explorer-change_transcript_fasta_header.sh | 0 ...orer-change_transcript_fasta_header.sh.bak | 14 +- 24 files changed, 1100 insertions(+), 908 deletions(-) delete mode 100644 __pycache__/docker_compose_generator.cpython-36.pyc delete mode 100644 __pycache__/docker_compose_generator.cpython-38.pyc delete mode 100644 __pycache__/metadata_generator.cpython-36.pyc delete mode 100644 __pycache__/metadata_generator.cpython-38.pyc delete mode 100644 __pycache__/table_parser.cpython-36.pyc delete mode 100644 __pycache__/table_parser.cpython-38.pyc rename examples/{example_input.json => json_example_input.json} (100%) rename setup_data_libraries.py => galaxy_data_libs_SI.py (100%) rename {ext_scripts => utils}/__init__.py (100%) rename {ext_scripts => utils}/blastdb.py (100%) rename {ext_scripts => utils}/common-stringSubsitute.py (97%) rename {ext_scripts => utils}/phaeoexplorer-change_pep_fasta_header.sh (96%) rename {ext_scripts => utils}/phaeoexplorer-change_transcript_fasta_header.sh (100%) rename {ext_scripts => utils}/phaeoexplorer-change_transcript_fasta_header.sh.bak (97%) diff --git a/.gitignore b/.gitignore index 07e2dd1..17e708b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,107 @@ -__pycache__ -.idea -phaeoexplorer_test.json -example.json -example.xlsx -*.bak -undaria_pinnatifida +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# IDE stuff
+.idea
\ No newline at end of file
diff --git a/__pycache__/docker_compose_generator.cpython-36.pyc b/__pycache__/docker_compose_generator.cpython-36.pyc
deleted file mode 100644
index f0bfefaa1f33103cb7eb2bf92aff23cbcba9ed3b..0000000000000000000000000000000000000000
GIT binary patch
diff --git a/__pycache__/docker_compose_generator.cpython-38.pyc b/__pycache__/docker_compose_generator.cpython-38.pyc
deleted file mode 100644
index 1201041cefa429a4343e2513ccd1c57b142f50b8..0000000000000000000000000000000000000000
GIT binary patch
diff --git a/__pycache__/metadata_generator.cpython-36.pyc b/__pycache__/metadata_generator.cpython-36.pyc
deleted file mode 100644
index 19eb173b57afdd0bdc0683e4ed77949e298cd055..0000000000000000000000000000000000000000
GIT binary patch
diff --git a/__pycache__/metadata_generator.cpython-38.pyc b/__pycache__/metadata_generator.cpython-38.pyc
deleted file mode 100644
index eed2e9474897c4805dc636277bafade56ab8a337..0000000000000000000000000000000000000000
GIT binary patch
diff --git a/__pycache__/table_parser.cpython-36.pyc b/__pycache__/table_parser.cpython-36.pyc
deleted file mode 100644
index 7272a5afeb889befe0494f81c89160a22c94c6fb..0000000000000000000000000000000000000000
GIT binary patch
diff --git a/__pycache__/table_parser.cpython-38.pyc b/__pycache__/table_parser.cpython-38.pyc
deleted file mode 100644
index 57f034e254bb97b21195f7f449cc828c52038074..0000000000000000000000000000000000000000
GIT binary patch
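For illustration, a minimal sketch of how the new create_input_instance.py (next diff) is expected to be used; this is not part of the patch. It assumes an input file shaped like examples/yml_example_input.yml (a top-level config block plus one block per species); the dictionary keys are the ones read in SpeciesData.__init__:

    # Sketch only, not part of the patch
    sp_dict_list = parse_input("examples/yml_example_input.yml")
    # -> [{"description": {"genus": "...", "species": "...", "strain": "...", "sex": "...", ...},
    #      "data": {"genome_version": "...", "ogs_version": "...", "parent_directory": "...", ...}}, ...]
    # The top-level "config" block is skipped; each remaining entry is used to build one SpeciesData object
    for sp_dict in sp_dict_list:
        species = SpeciesData(parameters_dictionary=sp_dict)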
diff --git a/create_input_instance.py b/create_input_instance.py
index 9ef50aa..d01dd88 100644
--- a/create_input_instance.py
+++ b/create_input_instance.py
@@ -13,9 +13,8 @@ from datetime import datetime
 """
 create_input_instance.py
 
-Create an object containing the data input from the yml file as attributes
-This object is then fed to the other scripts
-It is to avoid having several times the same code in several files
+Create an object containing the data from the input yml file as attributes; this object is then fed to the other scripts
+to avoid duplicating the same parsing code in several files
 """
 
 
@@ -23,7 +22,7 @@ It is to avoid having several times the same code in several files
 def parse_input(input_file):
     """
     Parse the yml input file to extract data to create the SpeciesData objects
-    Return a list of dictionaries. Each dictionary contains all the data
+    Return a list of dictionaries. Each dictionary contains the data tied to one species
     :param input_file:
     :return:
 
@@ -40,24 +39,58 @@ def parse_input(input_file):
         try:
             yaml_dict = yaml.safe_load(stream)
             for k, v in yaml_dict.items():
+                if k == "config":
+                    continue  # Skip the config block, only keep the species blocks
                 parsed_sp_dict_list.append(v)
-        except yaml.YAMLError as exc:
-            logging.debug(exc)
+        except yaml.YAMLError:
+            logging.critical("Error while parsing the YAML input file")
+            sys.exit()
     return parsed_sp_dict_list
 
 
+def parse_args():
+    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
+                                                 "with galaxy instances for GGA"
+                                                 ", following the protocol @ "
+                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
+
+    parser.add_argument("-i", "--input",
+                        help="Input file (yml)")
+
+    parser.add_argument("-v", "--verbose",
+                        help="Increase output verbosity",
+                        action="store_false")
+
+    parser.add_argument("--deploy-stacks",
+                        help="Create and deploy the stacks of services",
+                        action="store_true")
+
+    parser.add_argument("--load-data",
+                        help="Create src_data directory tree, copy datasets to src_data, and load these datasets "
+                             "into the instance, DEV",
+                        action="store_true")
+
+    parser.add_argument("--run-workflow",
+                        help="Run main workflow (load data into chado, sync all with tripal, "
+                             "index tripal data, populate materialized view, "
+                             "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse)")
+
+    args = parser.parse_args()
+
+    return args
 
 
 class SpeciesData:
     """
     This class contains attributes and functions to interact with the galaxy container of the GGA environment
-
+    Parent class of LoadData, DeploySpeciesStack and RunWorkflow
     """
 
-    def __init__(self, parameters_dictionary, args):
+    def __init__(self, parameters_dictionary):
         self.parameters_dictionary = parameters_dictionary
-        self.args = args
+        self.args = parse_args()  # Not a good design
         self.species = parameters_dictionary["description"]["species"]
         self.genus = parameters_dictionary["description"]["genus"]
         self.strain = parameters_dictionary["description"]["strain"]
@@ -105,57 +138,8 @@ class SpeciesData:
         self.do_update = False  # Update the instance (in histories corresponding to the input) instead of creating a new one // TODO: move this variable inside methods
         self.api_key = "dev"  # API key used to communicate with the galaxy instance. Set to "dev" for the moment. Cannot be used to do user-tied actions
-        self.args = args
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA"
-                                                 ", following the protocol @ "
-                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
-
-    # Dev arguments, TODO: remove in production branch!
-    parser.add_argument("--full",
-                        help="Run everything, from src_data dir tree creation, moving data files (abims) into src_data,"
-                             "modify headers (abims), generate blast banks (doesn't commit them: TODO), initialize GGA instance, load the data and run,"
-                             " the main workflow. To update/add data to container, use --update in conjunction to --full (TODO)")
-
-    parser.add_argument("--init-instance",
-                        help="Initialization of galaxy instance. 
Run first in an empty instance, DEV", - action="store_true") - - parser.add_argument("--deploy-stacks", - help="Create and deploy the stacks of services", - action="store_true") - - parser.add_argument("--load-data", - help="Create src_data directory tree, copy datasets to src_data, and load these datasets into the instance, DEV", - action="store_true") - - parser.add_argument("--run-workflow", - help="Run main workflow (load data into chado, sync all with tripal, " - "index tripal data, populate materialized view, " - "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse") - - - # Production arguments - parser.add_argument("input", type=str, help="Input file (yml)") - - parser.add_argument("-v", "--verbose", - help="Increase output verbosity", - action="store_false") - - parser.add_argument("--update", - help="Update an already integrated organisms with new data from input file, docker-compose.yml will not be re-generated" - ", assuming the instances for the organisms are already generated and initialized", - action="store_false") - - parser.add_argument("--dir", - help="Path of the main directory, either absolute or relative, defaults to current directory", - default=os.getcwd()) - - args = parser.parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - + if self.args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) diff --git a/deploy_stacks.py b/deploy_stacks.py index 73b766e..45a89d0 100755 --- a/deploy_stacks.py +++ b/deploy_stacks.py @@ -1,64 +1,31 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import bioblend -import bioblend.galaxy.objects -from bioblend import galaxy + import argparse import os import subprocess import logging import sys -import json import yaml import re -import metadata_generator -import docker_compose_generator -import table_parser +from gga_autoload.gga_load_data import table_parser import fnmatch import shutil from datetime import datetime -import create_input_instance + """ deploy_stacks.py - - -TODO: -- add config file (inside repo or outside with argument -- update existing history -- clean/delete instance? -- delete stack? -- commit the files for blast banks. - -TODO EOSC/Cloudification: -- divide into 2 general-use scripts - - create docker swarm, stacks, etc... (docker side) - - load data into libraries (method to load it at init, and a method/script to load it separately (galaxy side) (alb: galaxy_data_libs_SI does this already?) - -STEPS: -- read input (yml, maybe xlsx later) -- create dir_tree -- DONE -- find and copy data -- DONE -- change file headers, etc.. (ext scripts for data manipulation) -- IN PROGRESS -- generate blast banks and links -- NOT DONE -- generate and edit nginx confs -- DONE -- generate dc and start the containers -- IN PROGRESS -- connect to instance and launch tools>workflows -- IN PROGRESS -- generate and update metadata -- IN PROGRESS - - -NOTES: -- A master API key cannot be used, as some functions are tied to a user (like creating an history), so the access to the - galaxy instance must be done using email and password (definable in yml_example_input.yml) - +Usage: $ python3 deploy_stacks.py -i example.yml [OPTIONS] """ def parse_input(input_file): """ - Parse the yml, json or tabulated input in order to set attributes for the Autoload class + Parse the yml input file to extract data to create the SpeciesData objects + Return a list of dictionaries. 
Each dictionary contains data tied to a species :param input_file: :return: @@ -75,24 +42,24 @@ def parse_input(input_file): try: yaml_dict = yaml.safe_load(stream) for k, v in yaml_dict.items(): + if k == "config": + pass parsed_sp_dict_list.append(v) - except yaml.YAMLError as exc: - logging.debug(exc) + except yaml.YAMLError as exit_code: + logging.critical(exit_code + " (YAML input file might be incorrect)") + sys.exit() return parsed_sp_dict_list - - -class DeploySpeciesStacks: +class DeploySpeciesStack: """ - The class DeploySpeciesStacks + Deploy a stack of services for a given species """ - def __init__(self, parameters_dictionary, args): + def __init__(self, parameters_dictionary): self.parameters_dictionary = parameters_dictionary - self.args = args self.species = parameters_dictionary["description"]["species"] self.genus = parameters_dictionary["description"]["genus"] self.strain = parameters_dictionary["description"]["strain"] @@ -115,7 +82,8 @@ class DeploySpeciesStacks: self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex]) self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) self.genus_species = self.genus_lowercase + "_" + self.species - self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" # Testing with localhost/scratchgmodv1 + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" + # Testing with localhost/scratchgmodv1 self.instance = None self.history_id = None self.library_id = None @@ -129,59 +97,24 @@ class DeploySpeciesStacks: self.datasets = dict() self.source_files = dict() self.workflow_name = None - self.docker_compose_generator = None self.metadata = dict() - self.api_key = "dev" # TODO: set the key in config file --> saved for later (master api key access actions are limited) + self.api_key = "master" # TODO: set the key in config file --> saved for later (master api key access actions are limited) if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data else: self.source_data_dir = parameters_dictionary["data"]["parent_directory"] - # Directory/subdirectories where data files are located (fasta, gff, ...), point to a directory as close as possible to the source files + # Directory/subdirectories where data files are located (fasta, gff, ...) self.do_update = False - # Update the instance (in histories corresponding to the input) instead of creating a new one // TODO: move this variable inside methods - self.api_key = "dev" - # API key used to communicate with the galaxy instance. 
Set to "dev" for the moment // TODO: find a way to create, store then use the api key safely - - - # def get_source_data(self, max_depth): - # """ - # TODO: saved for later just in case - # - # Find and copy source data files to src_data directory tree - # - recursively search for the correct files (within a fixed max depth) - # - requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree) - # - the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension - # - # """ - # src_data_dir = os.path.join(self.species_dir, "/src_data") - # sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # example with VARIABLE - # - # # The regex works using the species attribute (unique) --> regex is probably not necessary - # sp_regex = "" - # for i in self.species: - # sp_regex = sp_regex + "?=\w*" + i + ")" - # sp_regex = sp_regex + ")\w+" - # re_dict = dict() - # re_dict["gff"] = None - # re_dict["transcripts"] = None - # re_dict["proteins"] = None - # re_dict["genome"] = None - # reg = None - # - # for dirpath, dirnames, files in os.walk(self.source_data_dir): - # for f in files: - # if self.species and self.sex in f: - # logging.info("File found") - - - - - def generate_dir_tree(self): + # Update the instance (in histories corresponding to the input) instead of creating a new one + self.api_key = "master" + # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions + self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder re + + + def make_directory_tree(self): """ Generate the directory tree for an organism and move datasets into src_data - TODO: DOCKER -- this is the one the "docker" parts of the script - :return: """ @@ -213,14 +146,37 @@ class DeploySpeciesStacks: # self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) organism_annotation_dir, organism_genome_dir = None, None - # Create src_data dir tree + # Creation (or updating) of the src_data directory tree + # Depth 0-1 try: os.mkdir("./src_data") os.mkdir("./src_data/annotation") os.mkdir("./src_data/genome") os.mkdir("./src_data/tracks") + except FileExistsError: + if self.do_update: + logging.info("Updating src_data directory tree") + else: + logging.debug("The src_data directory tree already exists") + except PermissionError: + logging.critical("Insufficient permission to create src_data directory tree") + sys.exit() + + # Depth 2 + try: os.mkdir("./src_data/annotation/" + self.species_folder_name) os.mkdir("./src_data/genome/" + self.species_folder_name) + except FileExistsError: + if self.do_update: + logging.info("Updating src_data directory tree") + else: + logging.debug("The src_data directory tree already exists") + except PermissionError: + logging.critical("Insufficient permission to create src_data directory tree") + sys.exit() + + # Depth 3 + try: os.mkdir("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.ogs_version) os.mkdir("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) organism_annotation_dir = os.path.abspath("./src_data/annotation/" + self.species_folder_name + "/OGS" + self.genome_version) @@ -234,6 +190,12 @@ class DeploySpeciesStacks: logging.critical("Insufficient permission to create src_data directory tree") sys.exit() + + def make_compose_files(self): + """ + + :return: + """ # Path to the templates 
used to generate the custom docker-compose files for an input species stack_template_path = self.script_dir + "/templates/stack-organism.yml" traefik_template_path = self.script_dir + "/templates/traefik.yml" @@ -248,20 +210,27 @@ class DeploySpeciesStacks: with open(stack_template_path, 'r') as infile: organism_content = list() for line in infile: - # One-liner to replace placeholders by the genus and species + # Replace placeholders in the compose file organism_content.append( - line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", str(self.genus_uppercase + " " + self.species)).replace("Genus/species", str(self.genus_uppercase + "/" + self.species)).replace("gspecies", str( self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", genus_species_strain_sex)) + line.replace("genus_species", str(self.genus.lower() + "_" + self.species)).replace("Genus species", + str( + self.genus_uppercase + " " + self.species)).replace( + "Genus/species", str(self.genus_uppercase + "/" + self.species)).replace("gspecies", str( + self.genus.lower()[0] + self.species)).replace("genus_species_strain_sex", + genus_species_strain_sex)) with open("./docker-compose.yml", 'w') as outfile: for line in organism_content: outfile.write(line) - subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers try: os.mkdir("../traefik") os.mkdir("../traefik/authelia") shutil.copy(authelia_config_path, "../traefik/authelia/configuration.yml") shutil.copy(authelia_users_path, "../traefik/authelia/users.yml") # TODO: custom users (add a config file?) 
- subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers + subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Create mounts for the containers except FileExistsError: logging.debug("Traefik directory already exists") try: @@ -271,11 +240,9 @@ class DeploySpeciesStacks: subprocess.call(["python3", self.script_dir + "/create_mounts.py"], cwd=working_dir) - def get_source_data_files_from_path(self): """ - Find all files in source_data directory, to link the matching files in the src_data dir tree - + Link data files :return: """ @@ -290,7 +257,7 @@ class DeploySpeciesStacks: organism_genome_dir = os.path.abspath("./src_data/genome/" + self.species_folder_name + "/v" + self.genome_version) for dirpath, dirnames, files in os.walk(self.source_data_dir): - if "0" in str(dirpath): # Ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same) #alb + if "0" in str(dirpath): # Ensures to take the correct files (other dirs hold files with the correct names, but I don't know if they are the same), this is for Phaeoexplorer only for f in files: if "Contaminants" not in str(f): try: @@ -322,7 +289,6 @@ class DeploySpeciesStacks: logging.warning("Error raised (NotADirectoryError)") - def deploy_stack(self): """ Call the script "deploy.sh" used to initiliaze the swarm cluster if needed and launch/update the stack @@ -330,458 +296,10 @@ class DeploySpeciesStacks: :return: """ # Launch and update docker stacks (cf docs) - # TODO: add a fail condition? subprocess.call(["sh", self.script_dir + "/deploy.sh", self.genus_species, self.main_dir + "/traefik"]) - - - def modify_fasta_headers(self): - """ - Change the fasta headers before integration. 
- - :return: - """ - - try: - os.chdir(self.species_dir) - working_dir = os.getcwd() - except OSError: - logging.info("Cannot access " + self.species_dir + ", run with higher privileges") - logging.info("Fatal error: exit") - sys.exit() - self.source_files = dict() - annotation_dir, genome_dir = None, None - for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]: - if "annotation/" in d: - annotation_dir = d - for f in os.listdir(d): - if f.endswith("proteins.fasta"): - self.source_files["proteins_file"] = os.path.join(d, f) - elif f.endswith("transcripts-gff.fa"): - self.source_files["transcripts_file"] = os.path.join(d, f) - elif f.endswith(".gff"): - self.source_files["gff_file"] = os.path.join(d, f) - elif "genome/" in d: - genome_dir = d - for f in os.listdir(d): - if f.endswith(".fa"): - self.source_files["genome_file"] = os.path.join(d, f) - logging.debug("source files found:") - for k, v in self.source_files.items(): - logging.debug("\t" + k + "\t" + v) - - # Changing headers in the *proteins.fasta file from >mRNA* to >protein* - # production version - modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh", - self.source_files["proteins_file"]] - # test version - # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh", - # self.source_files["proteins_file"]] - logging.info("Changing fasta headers: " + self.source_files["proteins_file"]) - subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) - # production version - modify_pep_headers = [str(self.main_dir) + "/gga_load_data/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh", - self.source_files["proteins_file"]] - # test version - # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh", - # self.source_files["proteins_file"]] - logging.info("Changing fasta headers: " + self.source_files["transcripts_file"]) - subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir) - - # src_data cleaning - if os.path.exists(annotation_dir + "outfile"): - subprocess.run(["mv", annotation_dir + "/outfile", self.source_files["proteins_file"]], - stdout=subprocess.PIPE, - cwd=annotation_dir) - if os.path.exists(annotation_dir + "gmon.out"): - subprocess.run(["rm", annotation_dir + "/gmon.out"], - stdout=subprocess.PIPE, - cwd=annotation_dir) - - - - - def generate_blast_banks(self): - """ - TODO - Automatically generate blast banks for a species - TODO: auto commit the files? 
- - :return: - """ - - - def connect_to_instance(self): - """ - TODO: move in init/access - TODO: password - Test the connection to the galaxy instance for the current organism - Exit if it cannot connect to the instance - """ - self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password", verify=False) - logging.info("Connecting to the galaxy instance ...") - try: - self.instance.histories.get_histories() - self.tool_panel = self.instance.tools.get_tool_panel() - except bioblend.ConnectionError: - logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) - sys.exit() - else: - logging.info("Successfully connected to galaxy instance @ " + self.instance_url) - self.instance.histories.create_history(name="FOO") - - - - - - - def setup_data_libraries(self): - """ - - generate blast banks and docker-compose - - load data into the galaxy container with the galaxy_data_libs_SI.py script - - :return: - """ - - try: - logging.info("Loading data into the galaxy container") - subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/setup_data_libraries.py", - shell=True) - except subprocess.CalledProcessError: - logging.info("Cannot load data into the galaxy container for " + self.full_name) - pass - else: - logging.info("Data successfully loaded into the galaxy container for " + self.full_name) - - self.get_species_history_id() - # self.get_instance_attributes() - # - # # import all datasets into current history - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"]) - # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"]) - - - - - - def get_species_history_id(self): - """ - Set and return the current species history id in its galaxy instance - - :return: - """ - histories = self.instance.histories.get_histories(name=str(self.full_name)) - self.history_id = histories[0]["id"] - self.instance.histories.show_history(history_id=self.history_id) - - return self.history_id - - - - - def create_species_history(self): - histories = self.instance.histories.get_histories(name=str(self.full_name)) - print("\n" + str(histories) + "\n" + self.full_name + "\n") - if not histories: - self.instance.histories.create_history(name="FOO") - print("Created history!") - - - - - - def get_instance_attributes(self): - """ - retrieves instance attributes: - - working history ID - - libraries ID (there should only be one library!) 
- - datasets IDs - - :return: - """ - histories = self.instance.histories.get_histories(name=str(self.full_name)) - self.history_id = histories[0]["id"] - logging.debug("history ID: " + self.history_id) - libraries = self.instance.libraries.get_libraries() # normally only one library - self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("library ID: " + self.history_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) - - folders_ids = {} - current_folder_name = "" - for i in instance_source_data_folders: - for k, v in i.items(): - if k == "name": - folders_ids[v] = 0 - current_folder_name = v - if k == "id": - folders_ids[current_folder_name] = v - logging.info("Folders and datasets IDs: ") - self.datasets = dict() - for k, v in folders_ids.items(): - logging.info("\t" + k + ": " + v) - if k == "/genome": - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) - for k2, v2 in sub_folder_content.items(): - for e in v2: - if type(e) == dict: - if e["name"].endswith(".fa"): - self.datasets["genome_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif k == "/annotation/" + self.genus_species: - sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) - for k2, v2 in sub_folder_content.items(): - for e in v2: - if type(e) == dict: - # TODO: manage several files of the same type and manage versions - if e["name"].endswith("transcripts-gff.fa"): - self.datasets["transcripts_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith("proteins.fasta"): - self.datasets["proteins_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith(".gff"): - self.datasets["gff_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - elif e["name"].endswith("MALE"): - self.datasets["gff_file"] = e["ldda_id"] - logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) - - - - - - - def init_instance(self): - """ - Galaxy instance startup in preparation for running workflows - - remove Homo sapiens from the chado database. 
- - add organism and analyses into the chado database --> separate - - get any other existing organisms IDs before updating the galaxy instance --> separate - - TODO: move the library and analysis/data stuff to a separate function - :return: - """ - - self.connect_to_instance() - self.get_species_history_id() - histories = self.instance.histories.get_histories(name=str(self.full_name)) - # Create the first history - if not histories: - self.instance.histories.create_history(name=str(self.full_name)) - self.history_id = histories[0]["id"] - logging.debug("history ID: " + self.history_id) - # libraries = self.instance.libraries.get_libraries() # routine check: one library - # self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library - logging.debug("library ID: " + self.history_id) - instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) - - # Delete Homo sapiens from Chado database - logging.debug("Getting 'Homo sapiens' ID in instance's chado database") - get_sapiens_id_job = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"genus": "Homo", "species": "sapiens"}) - get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] - get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) - try: - logging.debug("Deleting Homo 'sapiens' in the instance's chado database") - get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] - sapiens_id = str( - get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"organism": str(sapiens_id)}) - except bioblend.ConnectionError: - logging.debug("Homo sapiens isn't in the instance's chado database") - except IndexError: - logging.debug("Homo sapiens isn't in the instance's chado database") - pass - - # TODO: the following actions should be done in a separate function (in case if the user wants to do everything him/herself -- for EOSC) - # Add organism (species) to chado - logging.info("Adding organism to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", - history_id=self.history_id, - tool_inputs={"abbr": self.abbreviation, - "genus": self.genus, - "species": self.species, - "common": self.common}) - # Add OGS analysis to chado - logging.info("Adding OGS analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, - "program": "Performed by Genoscope", - "programversion": str("OGS" + self.ogs_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - - # Add genome analysis to chado - logging.info("Adding genome analysis to the instance's chado database") - self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, 
- "program": "Performed by Genoscope", - "programversion": str("genome v" + self.genome_version), - "sourcename": "Genoscope", - "date_executed": self.date}) - self.get_organism_and_analyses_ids() - logging.info("Finished initializing instance") - - - - - - - - def run_workflow(self, workflow_name, workflow_parameters, datamap): - """ - Run the "main" workflow in the galaxy instance - - import data to library - - load fasta and gff - - sync with tripal - - add jbrowse + organism - - fill in the tripal views - - TODO: map tool name to step id - :param workflow_name: - :param workflow_parameters: - :param datamap: - :return: - """ - - logging.debug("running workflow: " + str(workflow_name)) - workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga" - if self.strain != "": - custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga" - custom_ga_file_path = os.path.abspath(custom_ga_file) - else: - custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga" - custom_ga_file_path = os.path.abspath(custom_ga_file) - with open(workflow_ga_file, 'r') as ga_in_file: - workflow = str(ga_in_file.readlines()) - # ugly fix for the jbrowse parameters - workflow = workflow.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}', - str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"') - workflow = workflow.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"', - str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"') - workflow = workflow.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import - # test - workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', - "http://localhost/sp/" + self.genus_lowercase+ "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") - # production - # workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', - # "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") - workflow = workflow[2:-2] # if the line under doesn't output a correct json - # workflow = workflow[:-2] # if the line above doesn't output a correct json - - workflow_dict = json.loads(workflow) - - self.instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) - self.workflow_name = workflow_name - workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name) - workflow_id = workflow_attributes[0]["id"] - show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id) - logging.debug("Workflow ID: " + workflow_id) - - logging.debug("Inputs:") - logging.debug(show_workflow["Inputs"]) - self.instance.workflows.invoke_workflow(workflow_id=workflow_id, - history_id=self.history_id, - params=workflow_parameters, - inputs=datamap, - inputs_by="") - self.instance.workflows.delete_workflow(workflow_id=workflow_id) - - - - - - - def load_data_in_galaxy(self): - """ - Function to load the src_data folder in galaxy - - :return: - """ - - logging.info("Loading data in galaxy") - - return None - - - - - - def get_organism_and_analyses_ids(self): - """ - Retrieve current organism ID and OGS and genome chado analyses IDs (needed to run some tools as Tripal/Chado - doesn't accept organism/analyses names as valid inputs - - :return: - """ - # Get the ID for the 
current organism in chado - org = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", - history_id=self.history_id, - tool_inputs={"genus": self.genus, "species": self.species}) - org_job_out = org["outputs"][0]["id"] - org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) - try: - org_output = json.loads(org_json_output)[0] - self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools - except IndexError: - logging.debug("no organism matching " + self.full_name + " exists in the instance's chado database") - - # Get the ID for the OGS analysis in chado - ogs_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version}) - ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"] - ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out) - try: - ogs_analysis_output = json.loads(ogs_analysis_json_output)[0] - self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) - except IndexError: - logging.debug("no matching OGS analysis exists in the instance's chado database") - - # Get the ID for the genome analysis in chado - genome_analysis = self.instance.tools.run_tool( - tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", - history_id=self.history_id, - tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version}) - genome_analysis_job_out = genome_analysis["outputs"][0]["id"] - genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out) - try: - genome_analysis_output = json.loads(genome_analysis_json_output)[0] - self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) - except IndexError: - logging.debug("no matching genome analysis exists in the instance's chado database") - - - - - def clean_instance(self): - """ - TODO: method to purge the instance from analyses and organisms - :return: - """ - return None - - - - - def filter_empty_not_empty_items(li): ret = {"empty": [], "not_empty": []} for i in li: @@ -793,42 +311,18 @@ def filter_empty_not_empty_items(li): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction with galaxy instances for GGA" + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" ", following the protocol @ " "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") - # Dev arguments, TODO: remove in production branch! - parser.add_argument("--full", - help="Run everything, from src_data dir tree creation, moving data files (abims) into src_data," - "modify headers (abims), generate blast banks (doesn't commit them: TODO), initialize GGA instance, load the data and run," - " the main workflow. To update/add data to container, use --update in conjunction to --full (TODO)") - parser.add_argument("--init-instance", - help="Initialization of galaxy instance. 
Run first in an empty instance, DEV", - action="store_true") - parser.add_argument("--load-data", - help="Create src_data directory tree, copy datasets to src_data, and load these datasets into the instance, DEV", - action="store_true") - parser.add_argument("--run-main", - help="Run main workflow (load data into chado, sync all with tripal, " - "index tripal data, populate materialized view, " - "create a jbrowse for the current genus_species_strain_sex and add organism to jbrowse") - parser.add_argument("--generate-docker-compose", - help="Generate docker-compose.yml for current species, DEV") - parser.add_argument("--link-source", - help="Find source files in source data dir and copy them to src_data, DEV, OBSOLETE", - action="store_true") - - # Production arguments - parser.add_argument("input", type=str, help="Input file (yml)") + + parser.add_argument("input", + type=str, + help="Input file (yml)") + parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_false") - parser.add_argument("--update", - help="Update an already integrated organisms with new data from input file, docker-compose.yml will not be re-generated" - ", assuming the instances for the organisms are already generated and initialized", - action="store_false") - parser.add_argument("--dir", - help="Path of the main directory, either absolute or relative, defaults to current directory", - default=os.getcwd()) args = parser.parse_args() @@ -837,93 +331,23 @@ if __name__ == "__main__": else: logging.basicConfig(level=logging.INFO) - logging.info("Start") + logging.info("Deploy stacks: start") sp_dict_list = parse_input(args.input) for sp_dict in sp_dict_list: - al = Autoload(parameters_dictionary=sp_dict, args=args) - al.main_dir = os.path.abspath(args.dir) - if args.load_data: - """ - Full workflow - TODO: change later (docker side / load data side / galaxy side) - """ - # al.generate_dir_tree() - # logging.info("Successfully generated the directory tree for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - # # al.get_source_data_files_from_path() - # logging.info("Successfully retrieved source data files for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - # al.deploy_stack() - # logging.info("Successfully deployed containers stack for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - # - al.connect_to_instance() - logging.info("Connected to instance") - # - # al.create_species_history() - # logging.info("Created a history") - # - # al.setup_data_libraries() - # logging.info("Setting up data libraries") - - # al.init_instance() - # logging.info("Successfully initialized instance for " + al.genus[0].upper() + ". " + al.species + " " + al.strain + " " + al.sex) - - # al.setup_data_libraries() - # logging.info("Successfully set up data libraries in galaxy for " + al.genus[0].upper() + ". 
" + al.species + " " + al.strain + " " + al.sex) - - - # if args.init_instance: - # logging.info(" Initializing the galaxy instance") - # al.init_instance() - # al.get_instance_attributes() - # # metadata[genus_species_strain_sex]["initialized"] = True - # if args.load_data: - # logging.info("Loading data into galaxy") - # # al.load_data() - # # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True - # if args.run_main: - # logging.info("Running main workflow") - # al.get_organism_and_analyses_ids() - # workflow_parameters = dict() - # workflow_parameters["0"] = {} - # workflow_parameters["1"] = {} - # workflow_parameters["2"] = {} - # workflow_parameters["3"] = {} - # workflow_parameters["4"] = {"organism": al.org_id, - # "analysis_id": al.genome_analysis_id, - # "do_update": "true"} - # workflow_parameters["5"] = {"organism": al.org_id, - # "analysis_id": al.ogs_analysis_id} - # workflow_parameters["6"] = {"organism_id": al.org_id} - # workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id} - # workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id} - # workflow_parameters["9"] = {"organism_id": al.org_id} - # workflow_parameters["10"] = {} - # workflow_parameters["11"] = {} - # - # al.datamap = dict() - # al.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]} - # al.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]} - # al.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]} - # al.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]} - # - # al.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=al.datamap) - # # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main") - # - # if args.link_source: - # print('DEV') - # al.generate_dir_tree() - # print(al.main_dir) - # print(al.species_dir) - - logging.info("Exit") - - - -def main(species_data): - """ - "Main" function + o = DeploySpeciesStack(parameters_dictionary=sp_dict) + o.main_dir = os.path.abspath(args.dir) + + # dss.make_directory_tree() + # logging.info("Successfully generated the directory tree for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + dss.make_compose_files() + logging.info("Successfully generated the directory tree for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + # dss.get_source_data_files_from_path() + # logging.info("Successfully retrieved source data files for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex) + + # dss.deploy_stack() + # logging.info("Successfully deployed containers stack for " + o.genus[0].upper() + ". 
" + o.species + " " + o.strain + " " + o.sex) + + logging.info("Deploy stacks: done") - :return: - """ - print("OK") \ No newline at end of file diff --git a/docker_compose_generator.py b/docker_compose_generator.py index d5fe776..81fdcc3 100755 --- a/docker_compose_generator.py +++ b/docker_compose_generator.py @@ -6,23 +6,9 @@ import logging # import json """ -docker-compose.yml generator -The method "generate" works for both docker-compose architecture (old), or docker stack (new) -This method will write a formatted docker-compose.yml for the specified organism (only requires genus and species) - -Made to work in the integration streamlined script "deploy_stacks.py" but can be used as a standalone (either with a CLI -or in another python file as a module) - -Dockerfiles are specific to genus-species: a same organism can have several strains and sexes integrated, but only one -set of applications are used (see metadata files for details about what strains/sexes have been integrated for -an organism) +docker-compose_generator.py -TODO: write the whole yml dict from scratch (would allow the script to be more reusable into the future and make it -more customizable while being clearer (instead of the default yml string or input docker-compose template) - -TODO: read json - -API master key or galaxy: MASTER_API_KEY: XXXXXXX (alphanum, user prompt or git env variable) +This method will write a formatted docker-compose.yml for the specified organism (only requires genus and species) """ diff --git a/examples/example_input.json b/examples/json_example_input.json similarity index 100% rename from examples/example_input.json rename to examples/json_example_input.json diff --git a/examples/yml_example_input.yml b/examples/yml_example_input.yml index af0fe12..10395ab 100644 --- a/examples/yml_example_input.yml +++ b/examples/yml_example_input.yml @@ -3,13 +3,13 @@ # list of species for which the script will have to create these stacks/load data into galaxy/run workflows # Add new config option using a config scalar - -config: # Simple config part, allowing the user to create his/her own admin account (default is gga) - # WARNING: not supported currently, as the default connection is using the gga account - admin: +config: + admins: # Add admin account WARNING: not supported currently, as the default connection through a web browser is using the gga account username: "nflantier" # Desired admin username password: "blanquette" # Desired admin password - email: "noel.flantier@galaxy.org" # Desired admin email + email: "noel.flantier@mail.com" # Desired admin email + master_api_key: "master" # Master API key is useless at the moment + url_prefix: "http://localhost/ # URL prefix to forward ectocarpus_sp1: # Dummy value the user gives to designate the species (isn't used by the script) # Species description, leave blank if unknown or you don't want it to be used diff --git a/setup_data_libraries.py b/galaxy_data_libs_SI.py similarity index 100% rename from setup_data_libraries.py rename to galaxy_data_libs_SI.py diff --git a/load_data.py b/load_data.py index 4f3fbbd..dd5cb3d 100644 --- a/load_data.py +++ b/load_data.py @@ -5,25 +5,262 @@ import bioblend import bioblend.galaxy.objects from bioblend import galaxy +import argparse +import os +import subprocess import logging import sys -import deploy_stacks -import create_input_instance +import yaml +import re +from datetime import datetime -""" +""" load_data.py -Find source data files using the information provided in the input file. 
-Copy these source data files over into the src_data directory
+Usage: $ python3 load_data.py example.yml [OPTIONS]
+"""
+
 
-Load the data into Galaxy using the script provided by Anthony Bretaudeau (setup_data_libraries)
-Also create/update the species history (TODO: Updating history)
+
+def parse_input(input_file):
+    """
+    Parse the yml input file to extract data to create the SpeciesData objects
+    Return a list of dictionaries. Each dictionary contains data tied to a species
+
+    :param input_file:
+    :return:
+    """
-"""
+
+    parsed_sp_dict_list = []
+
+    if str(input_file).endswith("yml") or str(input_file).endswith("yaml"):
+        logging.debug("Input format used: YAML")
+    else:
+        logging.critical("Error, please input a YAML file")
+        sys.exit()
+    with open(input_file, 'r') as stream:
+        try:
+            yaml_dict = yaml.safe_load(stream)
+            for k, v in yaml_dict.items():
+                if k == "config":
+                    pass
+                parsed_sp_dict_list.append(v)
+        except yaml.YAMLError as exit_code:
+            logging.critical(str(exit_code) + " (YAML input file might be incorrect)")
+            sys.exit()
+    return parsed_sp_dict_list
+
+
+class LoadData:
+    """
+    Load data from the src_data subfolders into the galaxy instance's history of a given species
+
+    """
+
+    def __init__(self, parameters_dictionary):
+        self.parameters_dictionary = parameters_dictionary
+        self.species = parameters_dictionary["description"]["species"]
+        self.genus = parameters_dictionary["description"]["genus"]
+        self.strain = parameters_dictionary["description"]["strain"]
+        self.sex = parameters_dictionary["description"]["sex"]
+        self.common = parameters_dictionary["description"]["common_name"]
+        self.date = datetime.today().strftime("%Y-%m-%d")
+        self.origin = parameters_dictionary["description"]["origin"]
+        self.performed = parameters_dictionary["data"]["performed_by"]
+        if parameters_dictionary["data"]["genome_version"] == "":
+            self.genome_version = "1.0"
+        else:
+            self.genome_version = parameters_dictionary["data"]["genome_version"]
+        if parameters_dictionary["data"]["ogs_version"] == "":
+            self.ogs_version = "1.0"
+        else:
+            self.ogs_version = parameters_dictionary["data"]["ogs_version"]
+        self.genus_lowercase = self.genus[0].lower() + self.genus[1:]
+        self.genus_uppercase = self.genus[0].upper() + self.genus[1:]
+        self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex])
+        self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex])
+        self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex])
+        self.genus_species = self.genus_lowercase + "_" + self.species
+        self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/"
+        # Testing with localhost/scratchgmodv1
+        self.instance = None
+        self.history_id = None
+        self.library_id = None
+        self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
+        self.main_dir = None
+        self.species_dir = None
+        self.org_id = None
+        self.genome_analysis_id = None
+        self.ogs_analysis_id = None
+        self.tool_panel = None
+        self.datasets = dict()
+        self.source_files = dict()
+        self.workflow_name = None
+        self.metadata = dict()
+        self.api_key = "master"  # TODO: set the key in config file --> saved for later (master api key access actions are limited)
+        if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir":
+            self.source_data_dir = "/projet/sbr/phaeoexplorer/"  # Testing path for phaeoexplorer data
+        else:
+            
self.source_data_dir = parameters_dictionary["data"]["parent_directory"]
+        # Directory/subdirectories where data files are located (fasta, gff, ...)
+        self.do_update = False
+        # Update the instance (in histories corresponding to the input) instead of creating a new one
+        self.api_key = "master"
+        # API key used to communicate with the galaxy instance. Cannot be used to do user-tied actions
+        self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+"  # Placeholder re
+
+
+    def modify_fasta_headers(self):
+        """
+        Change the fasta headers before integration.
+
+        :return:
+        """
+
+        try:
+            os.chdir(self.species_dir)
+            working_dir = os.getcwd()
+        except OSError:
+            logging.info("Cannot access " + self.species_dir + ", run with higher privileges")
+            logging.info("Fatal error: exit")
+            sys.exit()
+        self.source_files = dict()
+        annotation_dir, genome_dir = None, None
+        for d in [i[0] for i in os.walk(os.getcwd() + "/src_data")]:
+            if "annotation/" in d:
+                annotation_dir = d
+                for f in os.listdir(d):
+                    if f.endswith("proteins.fasta"):
+                        self.source_files["proteins_file"] = os.path.join(d, f)
+                    elif f.endswith("transcripts-gff.fa"):
+                        self.source_files["transcripts_file"] = os.path.join(d, f)
+                    elif f.endswith(".gff"):
+                        self.source_files["gff_file"] = os.path.join(d, f)
+            elif "genome/" in d:
+                genome_dir = d
+                for f in os.listdir(d):
+                    if f.endswith(".fa"):
+                        self.source_files["genome_file"] = os.path.join(d, f)
+        logging.debug("source files found:")
+        for k, v in self.source_files.items():
+            logging.debug("\t" + k + "\t" + v)
+
+        # Changing headers in the *proteins.fasta file from >mRNA* to >protein*
+        # production version
+        modify_pep_headers = [str(self.main_dir) + "/gga_load_data/utils/phaeoexplorer-change_pep_fasta_header.sh",
+                              self.source_files["proteins_file"]]
+        # test version
+        # modify_pep_headers = ["/home/alebars/gga/phaeoexplorer-change_pep_fasta_header.sh",
+        #                       self.source_files["proteins_file"]]
+        logging.info("Changing fasta headers: " + self.source_files["proteins_file"])
+        subprocess.run(modify_pep_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
+
+        # Changing headers in the *transcripts-gff.fa file
+        # production version
+        modify_transcript_headers = [str(self.main_dir) + "/gga_load_data/utils/phaeoexplorer-change_transcript_fasta_header.sh",
+                                     self.source_files["transcripts_file"]]
+        # test version
+        # modify_transcript_headers = ["/home/alebars/gga/phaeoexplorer-change_transcript_fasta_header.sh",
+        #                              self.source_files["transcripts_file"]]
+        logging.info("Changing fasta headers: " + self.source_files["transcripts_file"])
+        subprocess.run(modify_transcript_headers, stdout=subprocess.PIPE, cwd=annotation_dir)
+
+        # src_data cleaning
+        if os.path.exists(annotation_dir + "/outfile"):
+            subprocess.run(["mv", annotation_dir + "/outfile", self.source_files["proteins_file"]],
+                           stdout=subprocess.PIPE,
+                           cwd=annotation_dir)
+        if os.path.exists(annotation_dir + "/gmon.out"):
+            subprocess.run(["rm", annotation_dir + "/gmon.out"],
+                           stdout=subprocess.PIPE,
+                           cwd=annotation_dir)
+
+
+    def setup_data_libraries(self):
+        """
+        - generate blast banks and docker-compose
+        - load data into the galaxy container with the galaxy_data_libs_SI.py script
+
+        :return:
+        """
+
+        try:
+            logging.info("Loading data into the galaxy container")
+            subprocess.run("../serexec genus_species_galaxy /tool_deps/_conda/bin/python /opt/galaxy_data_libs_SI.py", shell=True)
+        except subprocess.CalledProcessError:
+            logging.info("Cannot load data into the galaxy container for " + self.full_name)
+            pass
+        else:
+            logging.info("Data successfully loaded 
into the galaxy container for " + self.full_name)
+
+        self.get_species_history_id()
+        # self.get_instance_attributes()
+        #
+        # # import all datasets into current history
+        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["genome_file"])
+        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["gff_file"])
+        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["transcripts_file"])
+        # self.instance.histories.upload_dataset_from_library(history_id=self.history_id, lib_dataset_id=self.datasets["proteins_file"])
+
+
+    def generate_blast_banks(self):
+        """
+        Automatically generate blast banks for a species and commit
+
+        :return:
+        """
+
+
+    def connect_to_instance(self):
+        """
+        Test the connection to the galaxy instance for the current organism
+        Exit if it cannot connect to the instance
+        """
+        self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password",
+                                              verify=False)
+        logging.info("Connecting to the galaxy instance ...")
+        try:
+            self.instance.histories.get_histories()
+            self.tool_panel = self.instance.tools.get_tool_panel()
+        except bioblend.ConnectionError:
+            logging.critical("Cannot connect to galaxy instance @ " + self.instance_url)
+            sys.exit()
+        else:
+            logging.info("Successfully connected to galaxy instance @ " + self.instance_url)
+        self.instance.histories.create_history(name="FOO")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction "
+                                                 "with galaxy instances for GGA"
+                                                 ", following the protocol @ "
+                                                 "http://gitlab.sb-roscoff.fr/abims/e-infra/gga")
+
+    parser.add_argument("input",
+                        type=str,
+                        help="Input file (yml)")
+
+    parser.add_argument("-d", "--dir",
+                        type=str,
+                        help="Path of the main directory (defaults to the current directory)",
+                        default=os.getcwd())
+
+    parser.add_argument("-v", "--verbose",
+                        help="Increase output verbosity",
+                        action="store_true")
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+    logging.info("Load data: start")
+    sp_dict_list = parse_input(args.input)
+    for sp_dict in sp_dict_list:
+        o = LoadData(parameters_dictionary=sp_dict)
+        o.main_dir = os.path.abspath(args.dir)
+        o.modify_fasta_headers()
+        logging.info("Successfully formatted files headers " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex)
+        # o.setup_data_libraries()
+        # logging.info("Successfully set up data libraries in galaxy for " + o.genus[0].upper() + ". " + o.species + " " + o.strain + " " + o.sex)
+    logging.info("Load data: done")
diff --git a/run_workflow.py b/run_workflow.py
index 836e3e8..00e0c82 100644
--- a/run_workflow.py
+++ b/run_workflow.py
@@ -1,2 +1,464 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+
+
+import bioblend
+import bioblend.galaxy.objects
+from bioblend import galaxy
+import argparse
+import json
+import os
+import subprocess
+import logging
+import sys
+import yaml
+import re
+from datetime import datetime
+from gga_autoload.gga_load_data import metadata_generator
+
+"""
+run_workflow.py
+
+Usage: $ python3 run_workflow.py example.yml [OPTIONS]
+"""
+
+
+def parse_input(input_file):
+    """
+    Parse the yml input file to extract data to create the SpeciesData objects
+    Return a list of dictionaries. 
Each dictionary contains data tied to a species + + :param input_file: + :return: + """ + + parsed_sp_dict_list = [] + + if str(input_file).endswith("yml") or str(input_file).endswith("yaml"): + logging.debug("Input format used: YAML") + else: + logging.critical("Error, please input a YAML file") + sys.exit() + with open(input_file, 'r') as stream: + try: + yaml_dict = yaml.safe_load(stream) + for k, v in yaml_dict.items(): + if k == "config": + pass + parsed_sp_dict_list.append(v) + except yaml.YAMLError as exit_code: + logging.critical(exit_code + " (YAML input file might be incorrect)") + sys.exit() + return parsed_sp_dict_list + + +class RunWorkflow: + """ + Run a workflow into the galaxy instance's history of a given species + + """ + + def __init__(self, parameters_dictionary): + self.parameters_dictionary = parameters_dictionary + self.species = parameters_dictionary["description"]["species"] + self.genus = parameters_dictionary["description"]["genus"] + self.strain = parameters_dictionary["description"]["strain"] + self.sex = parameters_dictionary["description"]["sex"] + self.common = parameters_dictionary["description"]["common_name"] + self.date = datetime.today().strftime("%Y-%m-%d") + self.origin = parameters_dictionary["description"]["origin"] + self.performed = parameters_dictionary["data"]["performed_by"] + if parameters_dictionary["data"]["genome_version"] == "": + self.genome_version = "1.0" + else: + self.genome_version = parameters_dictionary["data"]["genome_version"] + if parameters_dictionary["data"]["ogs_version"] == "": + self.ogs_version = "1.0" + else: + self.ogs_version = parameters_dictionary["data"]["ogs_version"] + self.genus_lowercase = self.genus[0].lower() + self.genus[1:] + self.genus_uppercase = self.genus[0].upper() + self.genus[1:] + self.species_folder_name = "_".join([self.genus_lowercase, self.species, self.strain, self.sex]) + self.full_name = " ".join([self.genus_uppercase, self.species, self.strain, self.sex]) + self.abbreviation = " ".join([self.genus_lowercase[0], self.species, self.strain, self.sex]) + self.genus_species = self.genus_lowercase + "_" + self.species + self.instance_url = "http://scratchgmodv1:8888/sp/" + self.genus_lowercase + "_" + self.species + "/galaxy/" + # Testing with localhost/scratchgmodv1 + self.instance = None + self.history_id = None + self.library_id = None + self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + self.main_dir = None + self.species_dir = None + self.org_id = None + self.genome_analysis_id = None + self.ogs_analysis_id = None + self.tool_panel = None + self.datasets = dict() + self.source_files = dict() + self.workflow_name = None + self.metadata = dict() + self.api_key = "master" + if parameters_dictionary["data"]["parent_directory"] == "" or parameters_dictionary["data"]["parent_directory"] == "/path/to/closest/parent/dir": + self.source_data_dir = "/projet/sbr/phaeoexplorer/" # Testing path for phaeoexplorer data + else: + self.source_data_dir = parameters_dictionary["data"]["parent_directory"] + # Directory/subdirectories where data files are located (fasta, gff, ...) + self.do_update = False + # Update the instance (in histories corresponding to the input) instead of creating a new one + self.api_key = "master" + # API key used to communicate with the galaxy instance. 
Cannot be used to do user-tied actions + self.species_name_regex_litteral = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # Placeholder re + + + def get_species_history_id(self): + """ + Set and return the current species history id in its galaxy instance + + :return: + """ + histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.history_id = histories[0]["id"] + self.instance.histories.show_history(history_id=self.history_id) + + return self.history_id + + + def create_species_history(self): + histories = self.instance.histories.get_histories(name=str(self.full_name)) + print("\n" + str(histories) + "\n" + self.full_name + "\n") + if not histories: + self.instance.histories.create_history(name="FOO") + print("Created history!") + + + def get_instance_attributes(self): + """ + retrieves instance attributes: + - working history ID + - libraries ID (there should only be one library!) + - datasets IDs + + :return: + """ + histories = self.instance.histories.get_histories(name=str(self.full_name)) + self.history_id = histories[0]["id"] + logging.debug("history ID: " + self.history_id) + libraries = self.instance.libraries.get_libraries() # normally only one library + self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library + logging.debug("library ID: " + self.history_id) + instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) + + folders_ids = {} + current_folder_name = "" + for i in instance_source_data_folders: + for k, v in i.items(): + if k == "name": + folders_ids[v] = 0 + current_folder_name = v + if k == "id": + folders_ids[current_folder_name] = v + logging.info("Folders and datasets IDs: ") + self.datasets = dict() + for k, v in folders_ids.items(): + logging.info("\t" + k + ": " + v) + if k == "/genome": + sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + for k2, v2 in sub_folder_content.items(): + for e in v2: + if type(e) == dict: + if e["name"].endswith(".fa"): + self.datasets["genome_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif k == "/annotation/" + self.genus_species: + sub_folder_content = self.instance.folders.show_folder(folder_id=v, contents=True) + for k2, v2 in sub_folder_content.items(): + for e in v2: + if type(e) == dict: + # TODO: manage several files of the same type and manage versions + if e["name"].endswith("transcripts-gff.fa"): + self.datasets["transcripts_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith("proteins.fasta"): + self.datasets["proteins_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith(".gff"): + self.datasets["gff_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + elif e["name"].endswith("MALE"): + self.datasets["gff_file"] = e["ldda_id"] + logging.info("\t\t" + e["name"] + ": " + e["ldda_id"]) + + + def init_instance(self): + """ + Galaxy instance startup in preparation for running workflows + - remove Homo sapiens from the chado database. 
+ - add organism and analyses into the chado database --> separate + - get any other existing organisms IDs before updating the galaxy instance --> separate + + TODO: move the library and analysis/data stuff to a separate function + :return: + """ + + self.connect_to_instance() + self.get_species_history_id() + histories = self.instance.histories.get_histories(name=str(self.full_name)) + # Create the first history + if not histories: + self.instance.histories.create_history(name=str(self.full_name)) + self.history_id = histories[0]["id"] + logging.debug("history ID: " + self.history_id) + # libraries = self.instance.libraries.get_libraries() # routine check: one library + # self.library_id = self.instance.libraries.get_libraries()[0]["id"] # project data folder/library + logging.debug("library ID: " + self.history_id) + instance_source_data_folders = self.instance.libraries.get_folders(library_id=self.library_id) + + # Delete Homo sapiens from Chado database + logging.debug("Getting 'Homo sapiens' ID in instance's chado database") + get_sapiens_id_job = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"genus": "Homo", "species": "sapiens"}) + get_sapiens_id_job_output = get_sapiens_id_job["outputs"][0]["id"] + get_sapiens_id_json_output = self.instance.datasets.download_dataset(dataset_id=get_sapiens_id_job_output) + try: + logging.debug("Deleting Homo 'sapiens' in the instance's chado database") + get_sapiens_id_final_output = json.loads(get_sapiens_id_json_output)[0] + sapiens_id = str( + get_sapiens_id_final_output["organism_id"]) # needs to be str to be recognized by the chado tool + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_delete_organisms/organism_delete_organisms/2.3.2", + history_id=self.history_id, + tool_inputs={"organism": str(sapiens_id)}) + except bioblend.ConnectionError: + logging.debug("Homo sapiens isn't in the instance's chado database") + except IndexError: + logging.debug("Homo sapiens isn't in the instance's chado database") + pass + + # TODO: the following actions should be done in a separate function (in case if the user wants to do everything him/herself -- for EOSC) + # Add organism (species) to chado + logging.info("Adding organism to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_add_organism/organism_add_organism/2.3.2", + history_id=self.history_id, + tool_inputs={"abbr": self.abbreviation, + "genus": self.genus, + "species": self.species, + "common": self.common}) + # Add OGS analysis to chado + logging.info("Adding OGS analysis to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version, + "program": "Performed by Genoscope", + "programversion": str("OGS" + self.ogs_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + + # Add genome analysis to chado + logging.info("Adding genome analysis to the instance's chado database") + self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_add_analysis/analysis_add_analysis/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version, 
+ "program": "Performed by Genoscope", + "programversion": str("genome v" + self.genome_version), + "sourcename": "Genoscope", + "date_executed": self.date}) + self.get_organism_and_analyses_ids() + logging.info("Finished initializing instance") + + + def run_workflow(self, workflow_name, workflow_parameters, datamap): + """ + Run the "main" workflow in the galaxy instance + - import data to library + - load fasta and gff + - sync with tripal + - add jbrowse + organism + - fill in the tripal views + + TODO: map tool name to step id + :param workflow_name: + :param workflow_parameters: + :param datamap: + :return: + """ + + logging.debug("running workflow: " + str(workflow_name)) + workflow_ga_file = self.main_dir + "Galaxy-Workflow-" + workflow_name + ".ga" + if self.strain != "": + custom_ga_file = "_".join([self.genus, self.species, self.strain]) + "_workflow.ga" + custom_ga_file_path = os.path.abspath(custom_ga_file) + else: + custom_ga_file = "_".join([self.genus, self.species]) + "_workflow.ga" + custom_ga_file_path = os.path.abspath(custom_ga_file) + with open(workflow_ga_file, 'r') as ga_in_file: + workflow = str(ga_in_file.readlines()) + # ugly fix for the jbrowse parameters + workflow = workflow.replace('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"UNIQUE_ID\\\\\\\\\\\\"}', + str('{\\\\\\\\\\\\"unique_id\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus + " " + self.species) + '\\\\\\\\\\\\"') + workflow = workflow.replace('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"NAME\\\\\\\\\\\\"', + str('\\\\\\\\\\\\"name\\\\\\\\\\\\": \\\\\\\\\\\\"' + self.genus.lower()[0] + self.species) + '\\\\\\\\\\\\"') + workflow = workflow.replace("\\\\", "\\") # to restore the correct amount of backslashes in the workflow string before import + # test + workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', + "http://localhost/sp/" + self.genus_lowercase+ "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") + # production + # workflow = workflow.replace('http://localhost/sp/genus_species/feature/Genus/species/mRNA/{id}', + # "http://abims--gga.sb-roscoff.fr/sp/" + self.genus_lowercase + "_" + self.species + "/feature/" + self.genus + "/mRNA/{id}") + workflow = workflow[2:-2] # if the line under doesn't output a correct json + # workflow = workflow[:-2] # if the line above doesn't output a correct json + + workflow_dict = json.loads(workflow) + + self.instance.workflows.import_workflow_dict(workflow_dict=workflow_dict) + self.workflow_name = workflow_name + workflow_attributes = self.instance.workflows.get_workflows(name=self.workflow_name) + workflow_id = workflow_attributes[0]["id"] + show_workflow = self.instance.workflows.show_workflow(workflow_id=workflow_id) + logging.debug("Workflow ID: " + workflow_id) + + logging.debug("Inputs:") + logging.debug(show_workflow["Inputs"]) + self.instance.workflows.invoke_workflow(workflow_id=workflow_id, + history_id=self.history_id, + params=workflow_parameters, + inputs=datamap, + inputs_by="") + self.instance.workflows.delete_workflow(workflow_id=workflow_id) + + + def get_organism_and_analyses_ids(self): + """ + Retrieve current organism ID and OGS and genome chado analyses IDs (needed to run some tools as Tripal/Chado + doesn't accept organism/analyses names as valid inputs + + :return: + """ + # Get the ID for the current organism in chado + org = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_organism_get_organisms/organism_get_organisms/2.3.2", + 
history_id=self.history_id, + tool_inputs={"genus": self.genus, "species": self.species}) + org_job_out = org["outputs"][0]["id"] + org_json_output = self.instance.datasets.download_dataset(dataset_id=org_job_out) + try: + org_output = json.loads(org_json_output)[0] + self.org_id = str(org_output["organism_id"]) # id needs to be a str to be recognized by chado tools + except IndexError: + logging.debug("no organism matching " + self.full_name + " exists in the instance's chado database") + + # Get the ID for the OGS analysis in chado + ogs_analysis = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " OGS" + self.ogs_version}) + ogs_analysis_job_out = ogs_analysis["outputs"][0]["id"] + ogs_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=ogs_analysis_job_out) + try: + ogs_analysis_output = json.loads(ogs_analysis_json_output)[0] + self.ogs_analysis_id = str(ogs_analysis_output["analysis_id"]) + except IndexError: + logging.debug("no matching OGS analysis exists in the instance's chado database") + + # Get the ID for the genome analysis in chado + genome_analysis = self.instance.tools.run_tool( + tool_id="toolshed.g2.bx.psu.edu/repos/gga/chado_analysis_get_analyses/analysis_get_analyses/2.3.2", + history_id=self.history_id, + tool_inputs={"name": self.genus + " " + self.species + " genome v" + self.genome_version}) + genome_analysis_job_out = genome_analysis["outputs"][0]["id"] + genome_analysis_json_output = self.instance.datasets.download_dataset(dataset_id=genome_analysis_job_out) + try: + genome_analysis_output = json.loads(genome_analysis_json_output)[0] + self.genome_analysis_id = str(genome_analysis_output["analysis_id"]) + except IndexError: + logging.debug("no matching genome analysis exists in the instance's chado database") + + + def connect_to_instance(self): + """ + TODO: move in init/access + TODO: password + Test the connection to the galaxy instance for the current organism + Exit if it cannot connect to the instance + """ + self.instance = galaxy.GalaxyInstance(url=self.instance_url, email="gga@sb-roscoff.fr", password="password", + verify=False) + logging.info("Connecting to the galaxy instance ...") + try: + self.instance.histories.get_histories() + self.tool_panel = self.instance.tools.get_tool_panel() + except bioblend.ConnectionError: + logging.critical("Cannot connect to galaxy instance @ " + self.instance_url) + sys.exit() + else: + logging.info("Successfully connected to galaxy instance @ " + self.instance_url) + self.instance.histories.create_history(name="FOO") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Automatic data loading in containers and interaction " + "with galaxy instances for GGA" + ", following the protocol @ " + "http://gitlab.sb-roscoff.fr/abims/e-infra/gga") + + parser.add_argument("input", + type=str, + help="Input file (yml)") + + parser.add_argument("-v", "--verbose", + help="Increase output verbosity", + action="store_false") + + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + logging.info("Start") + sp_dict_list = parse_input(args.input) + + for sp_dict in sp_dict_list: + o = RunWorkflow(parameters_dictionary=sp_dict) + o.main_dir = os.path.abspath(args.dir) + if args.init_instance: + logging.info(" Initializing the galaxy 
instance") + o.init_instance() + o.get_instance_attributes() + # metadata[genus_species_strain_sex]["initialized"] = True + if args.load_data: + logging.info("Loading data into galaxy") + # o.load_data() + # metadata[genus_species_strain_sex]["data_loaded_in_instance"] = True + if args.run_main: + logging.info("Running main workflow") + o.get_organism_and_analyses_ids() + workflow_parameters = dict() + workflow_parameters["0"] = {} + workflow_parameters["1"] = {} + workflow_parameters["2"] = {} + workflow_parameters["3"] = {} + workflow_parameters["4"] = {"organism": al.org_id, + "analysis_id": al.genome_analysis_id, + "do_update": "true"} + workflow_parameters["5"] = {"organism": al.org_id, + "analysis_id": al.ogs_analysis_id} + workflow_parameters["6"] = {"organism_id": al.org_id} + workflow_parameters["7"] = {"analysis_id": al.ogs_analysis_id} + workflow_parameters["8"] = {"analysis_id": al.genome_analysis_id} + workflow_parameters["9"] = {"organism_id": al.org_id} + workflow_parameters["10"] = {} + workflow_parameters["11"] = {} + + o.datamap = dict() + o.datamap["0"] = {"src": "hda", "id": al.datasets["genome_file"]} + o.datamap["1"] = {"src": "hda", "id": al.datasets["gff_file"]} + o.datamap["2"] = {"src": "hda", "id": al.datasets["proteins_file"]} + o.datamap["3"] = {"src": "hda", "id": al.datasets["transcripts_file"]} + + o.run_workflow(workflow_name="main", workflow_parameters=workflow_parameters, datamap=al.datamap) + # metadata[genus_species_strain_sex]["workflows_run"] = metadata[genus_species_strain_sex]["workflows_run"].append("main") diff --git a/table_parser.py b/table_parser.py index 9e55ecd..9314b91 100755 --- a/table_parser.py +++ b/table_parser.py @@ -1,80 +1,79 @@ -import os -import sys -import pandas # xlrd required for excel files reading -import numpy -import json -import argparse -import logging -from datetime import datetime - -""" -OBSOLETE - -Input parser script. 
-Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) -Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) -See example toy table (toy_table.xls) - -TODO: move this script inside autoload - -standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> -""" - - -class TableParser: - - def __init__(self, table_file, dir): - self.dir = os.path.abspath(args.dir) - self.table_file = table_file - self.method = None # TODO: instant launch or just parse (standalone) - self.extension = None - self.meta = dict() - self.json_file = None - - def parse_table(self, extension): - if extension == "xls": - pandas_table = pandas.DataFrame(pandas.read_excel(self.table_file)) - elif extension == "csv": - pandas_table = pandas.DataFrame(pandas.read_csv(self.table_file)) - else: - logging.info("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)") - sys.exit() - pandas_table = pandas_table.replace(numpy.nan, "", regex=True) - - for char in " ,.()-/": - pandas_table = pandas_table.replace("\\" + char, "_", regex=True) - pandas_table = pandas_table.replace("\\__", "_", regex=True) - pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" - pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" - pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" - pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") - with open(os.path.join(self.dir, self.json_file), 'w') as json_file: - json_file.truncate(0) - json_content = list() - for organism in range(0, len(pandas_table.index)): - organism_dict = pandas_table.iloc[organism].to_dict() - for k, v in organism_dict.items(): - v = str(v).split(" ") - v = "_".join(v) - v = v.replace("__", "_") - if v.endswith("_"): - v = v[:-1] - json_content.append(organism_dict) - json.dump(json_content, json_file, indent=4) - - def write_json(self, data, filename): - with open(filename, 'w') as f: - json.dump(data, f, indent=4) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data") - parser.add_argument("input", type=str, help="input table") - parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default = os.getcwd()) - args = parser.parse_args() - - if args.input.endswith("xlsx") or args.input.endswith("xls"): - tp = TableParser(table_file=args.input, dir=args.dir) - tp.extension = args.input.split(".")[1] - tp.json_file = tp.dir + "/dataloader_" + datetime.today().strftime("%Y%m%d") + ".json" - tp.parse_table(extension="xls") +import os +import sys +import pandas # xlrd required for excel files reading +import numpy +import json +import argparse +import logging +from datetime import datetime + +""" +!! OBSOLETE !! + +Input parser script. 
+Does not work for ods spreadsheets (save as xls or xlsx instead) --> need to handle with pandas_ods_reader (requires ezodf, lxml) +Does not support multiple sheets (TODO: "integration" and "update" sheets (1 and 2)) +See example toy table (toy_table.xls) + +standalone usage: python3 table_parser.py <tabulated_file> -d <directory_to_write_json_to (default: cwd)> + +""" + + +class TableParser: + + def __init__(self, table_file, dir): + self.dir = os.path.abspath(args.dir) + self.table_file = table_file + self.method = None # TODO: instant launch or just parse (standalone) + self.extension = None + self.meta = dict() + self.json_file = None + + def parse_table(self, extension): + if extension == "xls": + pandas_table = pandas.DataFrame(pandas.read_excel(self.table_file)) + elif extension == "csv": + pandas_table = pandas.DataFrame(pandas.read_csv(self.table_file)) + else: + logging.info("wrong format: input tabulated file cannot be read (supported formats: xls, xlsx, csv)") + sys.exit() + pandas_table = pandas_table.replace(numpy.nan, "", regex=True) + + for char in " ,.()-/": + pandas_table = pandas_table.replace("\\" + char, "_", regex=True) + pandas_table = pandas_table.replace("\\__", "_", regex=True) + pandas_table.loc[pandas_table["genome version"] == "", "genome version"] = "1.0" + pandas_table.loc[pandas_table["ogs version"] == "", "ogs version"] = "1.0" + pandas_table.loc[pandas_table["version"] == "", "version"] = "1.0" + pandas_table.loc[pandas_table["date"] == "", "date"] = datetime.today().strftime("%Y-%m-%d") + with open(os.path.join(self.dir, self.json_file), 'w') as json_file: + json_file.truncate(0) + json_content = list() + for organism in range(0, len(pandas_table.index)): + organism_dict = pandas_table.iloc[organism].to_dict() + for k, v in organism_dict.items(): + v = str(v).split(" ") + v = "_".join(v) + v = v.replace("__", "_") + if v.endswith("_"): + v = v[:-1] + json_content.append(organism_dict) + json.dump(json_content, json_file, indent=4) + + def write_json(self, data, filename): + with open(filename, 'w') as f: + json.dump(data, f, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Table parser for phaeoexplorer data") + parser.add_argument("input", type=str, help="input table") + parser.add_argument("-d", "--dir", type=str, help="Where to write the output json file that is be used for integration", default = os.getcwd()) + args = parser.parse_args() + + if args.input.endswith("xlsx") or args.input.endswith("xls"): + tp = TableParser(table_file=args.input, dir=args.dir) + tp.extension = args.input.split(".")[1] + tp.json_file = tp.dir + "/dataloader_" + datetime.today().strftime("%Y%m%d") + ".json" + tp.parse_table(extension="xls") diff --git a/templates/compose-template.yml b/templates/compose-template.yml index 590923c..b3b8578 100755 --- a/templates/compose-template.yml +++ b/templates/compose-template.yml @@ -81,7 +81,7 @@ services: galaxy: image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod volumes: - - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py + - ../galaxy_data_libs_SI.py:/opt/galaxy_data_libs_SI.py - ./docker_data/galaxy:/export - ./src_data/:/project_data:ro #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/... 
diff --git a/templates/stack-organism.yml b/templates/stack-organism.yml index 519b96f..103757f 100644 --- a/templates/stack-organism.yml +++ b/templates/stack-organism.yml @@ -112,7 +112,7 @@ services: galaxy: image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod volumes: - - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py + - ../galaxy_data_libs_SI.py:/opt/galaxy_data_libs_SI.py - ./docker_data/galaxy/:/export/ - ./src_data/:/project_data/:ro #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/... @@ -130,7 +130,7 @@ services: GALAXY_DEFAULT_ADMIN_USER: "gga" GALAXY_DEFAULT_ADMIN_PASSWORD: "password" GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org, gga@sb-roscoff.fr, lgueguen@sb-roscoff.fr, alebars@sb-roscoff.fr" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect - GALAXY_CONFIG_MASTER_API_KEY: "dev" + GALAXY_CONFIG_MASTER_API_KEY: "master" ENABLE_FIX_PERMS: 0 PROXY_PREFIX: /sp/genus_species/galaxy GALAXY_TRIPAL_URL: http://tripal.genus_species/tripal/ diff --git a/ext_scripts/__init__.py b/utils/__init__.py similarity index 100% rename from ext_scripts/__init__.py rename to utils/__init__.py diff --git a/ext_scripts/blastdb.py b/utils/blastdb.py similarity index 100% rename from ext_scripts/blastdb.py rename to utils/blastdb.py diff --git a/ext_scripts/common-stringSubsitute.py b/utils/common-stringSubsitute.py similarity index 97% rename from ext_scripts/common-stringSubsitute.py rename to utils/common-stringSubsitute.py index c32a177..c4d22a9 100755 --- a/ext_scripts/common-stringSubsitute.py +++ b/utils/common-stringSubsitute.py @@ -1,37 +1,37 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import argparse -import os -import re -import sys - -# Return the file obtained by replacing the occurrences of pattern by the replacement string. -#Â Use of python method re.sub() -# python common-stringSubsitute.py -f file -p pattern -r replacement_string -# ex : python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)' - -if __name__ == '__main__': - - #Get arguments - parser = argparse.ArgumentParser(description="Return the file obtained by replacing the occurrences of pattern by the replacement string. Use of python method re.sub(). Example: python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'") - parser.add_argument('-i','--infile', help='Input file', required=True) - parser.add_argument('-o','--outfile', help='Output file', default='outfile') - parser.add_argument('-p','--pattern', help='Pattern string to be replaced', required=True) - parser.add_argument('-r','--repl', help='Replacement string', required=True) - args = parser.parse_args() - - infilename=args.infile - outfilename=args.outfile - pattern=args.pattern - repl=args.repl - - infile=open(infilename,'r') - outfile=open(outfilename,'w') - - lines=infile.readlines() - - for line in lines : - line_out=re.sub(pattern,repl,line) - outfile.write(line_out) - +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import argparse +import os +import re +import sys + +# Return the file obtained by replacing the occurrences of pattern by the replacement string. 
+#Â Use of python method re.sub() +# python common-stringSubsitute.py -f file -p pattern -r replacement_string +# ex : python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)' + +if __name__ == '__main__': + + #Get arguments + parser = argparse.ArgumentParser(description="Return the file obtained by replacing the occurrences of pattern by the replacement string. Use of python method re.sub(). Example: python common-stringSubsitute.py -f file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'") + parser.add_argument('-i','--infile', help='Input file', required=True) + parser.add_argument('-o','--outfile', help='Output file', default='outfile') + parser.add_argument('-p','--pattern', help='Pattern string to be replaced', required=True) + parser.add_argument('-r','--repl', help='Replacement string', required=True) + args = parser.parse_args() + + infilename=args.infile + outfilename=args.outfile + pattern=args.pattern + repl=args.repl + + infile=open(infilename,'r') + outfile=open(outfilename,'w') + + lines=infile.readlines() + + for line in lines : + line_out=re.sub(pattern,repl,line) + outfile.write(line_out) + outfile.close() \ No newline at end of file diff --git a/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh b/utils/phaeoexplorer-change_pep_fasta_header.sh similarity index 96% rename from ext_scripts/phaeoexplorer-change_pep_fasta_header.sh rename to utils/phaeoexplorer-change_pep_fasta_header.sh index 0de7b9b..3cf614f 100755 --- a/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh +++ b/utils/phaeoexplorer-change_pep_fasta_header.sh @@ -1,17 +1,17 @@ -#!/usr/bin/env bash - -INFILE=$1 -OUTFILE=tmpfile - -FILE_HEADER_START=$(grep ">" $INFILE | cut -c 1-6 | sort | uniq) -HEADER_START_STRING=">mRNA." - -if [[ "$FILE_HEADER_START" == "$HEADER_START_STRING" ]] -then - /usr/local/genome2/mmo/scripts/common/common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>mRNA' -r '>protein' - mv $OUTFILE $INFILE - echo "'>mRNA' replaced by '>protein'" -else - echo "Abort. Not all headers start with '>mRNA.':" - echo "$FILE_HEADER_START" +#!/usr/bin/env bash + +INFILE=$1 +OUTFILE=tmpfile + +FILE_HEADER_START=$(grep ">" $INFILE | cut -c 1-6 | sort | uniq) +HEADER_START_STRING=">mRNA." + +if [[ "$FILE_HEADER_START" == "$HEADER_START_STRING" ]] +then + /usr/local/genome2/mmo/scripts/common/common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>mRNA' -r '>protein' + mv $OUTFILE $INFILE + echo "'>mRNA' replaced by '>protein'" +else + echo "Abort. 
Not all headers start with '>mRNA.':" + echo "$FILE_HEADER_START" fi \ No newline at end of file diff --git a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh b/utils/phaeoexplorer-change_transcript_fasta_header.sh similarity index 100% rename from ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh rename to utils/phaeoexplorer-change_transcript_fasta_header.sh diff --git a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak b/utils/phaeoexplorer-change_transcript_fasta_header.sh.bak similarity index 97% rename from ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak rename to utils/phaeoexplorer-change_transcript_fasta_header.sh.bak index 196675b..12ce4e5 100755 --- a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh.bak +++ b/utils/phaeoexplorer-change_transcript_fasta_header.sh.bak @@ -1,7 +1,7 @@ -#!/usr/bin/env bash - -INFILE=$1 -OUTFILE=tmpfile -/home/fr2424/sib/alebars/gga_load_data/ext_scripts/common-stringSubsitute.py -i $INFILE -o $OUTFILE -p '^>\d+ mRNA' -r '>mRNA' -mv $OUTFILE $INFILE -echo "'>[0-9]+ mRNA' replaced by '>mRNA' in $1" +#!/usr/bin/env bash + +INFILE=$1 +OUTFILE=tmpfile +/home/fr2424/sib/alebars/gga_load_data/ext_scripts/common-stringSubsitute.py -i $INFILE -o $OUTFILE -p '^>\d+ mRNA' -r '>mRNA' +mv $OUTFILE $INFILE +echo "'>[0-9]+ mRNA' replaced by '>mRNA' in $1" -- GitLab
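
Both load_data.py and run_workflow.py in this patch define the same parse_input() helper, and as written the top-level "config" block of the YAML input is appended to the returned list together with the species entries. The snippet below is only a sketch of an alternative, not code from this patch: it assumes the layout of examples/yml_example_input.yml (one "config" mapping plus one mapping per species) and the same PyYAML dependency, and it returns the config separately so the species loops in the scripts never see it.

    #!/usr/bin/env python3
    # Hypothetical variant of parse_input() -- not part of the committed scripts.
    import logging
    import sys

    import yaml


    def parse_input(input_file):
        """Return (config_dict, species_dict_list) parsed from the YAML input file."""
        if not str(input_file).endswith(("yml", "yaml")):
            logging.critical("Error, please input a YAML file")
            sys.exit()
        with open(input_file, "r") as stream:
            try:
                yaml_dict = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                logging.critical(str(exc) + " (YAML input file might be incorrect)")
                sys.exit()
        config = {}
        parsed_sp_dict_list = []
        for key, value in yaml_dict.items():
            if key == "config":
                config = value  # admin account, master_api_key, url_prefix, ...
            else:
                parsed_sp_dict_list.append(value)  # one dict per species, as before
        return config, parsed_sp_dict_list

A caller could then unpack it as "config, sp_dict_list = parse_input(args.input)" and keep iterating over sp_dict_list exactly as the three scripts do now.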