From b33e6219690c42c74822d28d59144592ef96a616 Mon Sep 17 00:00:00 2001
From: Arthur Le Bars <arthur.le-bars@sb-roscoff.fr>
Date: Thu, 9 Apr 2020 16:07:21 +0200
Subject: [PATCH] tidying up the repo

---
 README.md                                     |  41 ++-
 autoload.py                                   |  30 +-
 examples/example.json                         |  27 ++
 examples/example.xlsx                         | Bin 0 -> 5651 bytes
 ext_scripts/__init__.py                       |   0
 ext_scripts/blastdb.py                        | 298 ++++++++++++++++++
 ext_scripts/common-stringSubsitute.py         |  37 +++
 .../phaeoexplorer-change_pep_fasta_header.sh  |  17 +
 ...explorer-change_transcript_fasta_header.sh |   8 +
 templates/compose-template.yml                | 229 ++++++++++++++
 templates/stack_template.yml                  | 286 +++++++++++++++++
 11 files changed, 946 insertions(+), 27 deletions(-)
 create mode 100755 examples/example.json
 create mode 100755 examples/example.xlsx
 create mode 100755 ext_scripts/__init__.py
 create mode 100755 ext_scripts/blastdb.py
 create mode 100755 ext_scripts/common-stringSubsitute.py
 create mode 100755 ext_scripts/phaeoexplorer-change_pep_fasta_header.sh
 create mode 100755 ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh
 create mode 100755 templates/compose-template.yml
 create mode 100755 templates/stack_template.yml

diff --git a/README.md b/README.md
index e3a69ad..74df877 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# gga_load_data
+# gga_load_data (WIP)
 
 Automated integration of new organisms into GGA instances.
 
@@ -8,10 +8,23 @@ As input, the script either takes a tabulated file (xls, xlsx or csv) or a json
 For each organism to be integrated, the script needs at least its genus and species (strain, sex, genome and annotation file versions are optional, but the two latter will default to version 1.0, and the two former will be left empty and will not be considered during the integration process). See the toy datasets (input_toy.json and input_toy.xlsx) for an example of what information can be described and of the correct formatting of these input files. The script should then take care of everything (for phaeoexplorer organisms), from generating the directory tree to running workflows and tools in the Galaxy instance.
 
-## Metadata files (in progress):
+## TODO:
+- ready the script for production (add usage arguments): remove dev args for master merge
+- metadata
+- search and link source files to src_data
+- call the scripts for formatting data, generate blast banks
+- nginx conf editing (+ master key in docker-compose)
+- set master key
+- user password input + store hash
+
+## Metadata files (WIP):
 The script also generates a metadata file in the directory of the newly integrated species, summing up what actions were taken for this organism (see meta_toy.yaml for the kind of information it can contain). It also creates another metadata file in the main directory (where you put all the organisms you have integrated), which aggregates the metadata files of all integrated organisms. These metadata files are also updated when updating an existing instance.
 
+## nginx conf (WIP):
+The default.conf will be generated automatically (with automatic port assignment); APIs will be able to bypass authentication (for bioblend, a master key
+is set when the docker-compose.yml of the organism is created)
+
 ## Directory tree:
 For every input organism, the script will create the following directory structure, or try to update it if it already exists.
 It will update the files in the main directory to account for the new organisms that are getting integrated.
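For reference, this is what a minimal JSON input entry looks like (reproduced from examples/example.json, added by this patch); per the paragraph above, only genus and species are strictly required:

```json
[
  {
    "genus" : "genus1",
    "species" : "Species1",
    "common" : "Common1",
    "strain" : "strain1",
    "sex" : "male",
    "version" : "1.0",
    "performed by" : "Institute John Doe",
    "genome version" : "1.0",
    "ogs version" : "1.0",
    "date" : "2020-01-01"
  }
]
```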
@@ -22,41 +35,41 @@ It will update the files in the main directory to account for the new organisms
 ```
 |---/genus1_species1
 | |
 | |---/blast
-| | |---/<links class="yml"></links>
-| | |---/<banks class="yml"></banks>
+| | |---/links.yml
+| | |---/banks.yml
 | |
 | |---/nginx
 | | |---/conf
-| | |---/<default class="conf"></default>
+| | |---/default.conf
 | |
 | |---/src_data
 | | |---/genome
 | | | |---/genus1_species1_strain_sex
 | | | |---/vX.X
-| | | |---/<genus_species_vX class="X fasta"></genus_species_vX>
+| | | |---/genus_species_vX.X.fasta
 | | |
 | | |---/annotation
 | | | |---/genus1_species1_strain_sex
 | | | |---/OGSX.X
-| | | |---/<OGSX class="X gff"></OGSX>
-| | | |---/<OGSX class="X_pep fasta"></OGSX>
-| | | |---/<OGSX class="X_cds fasta"></OGSX>
+| | | |---/OGSX.X.gff
+| | | |---/OGSX.X_pep.fasta
+| | | |---/OGSX.X_transcripts.fasta
 | | |
 | | |---/tracks
 | | |---/genus1_species1_strain_sex
 | |
 | |---/apollo
-| | |---/<annotation_groups class="tsv"></annotation_groups>
+| | |---/annotation_groups.tsv
 | |
-| |---/<docker-compose class="yml"></docker-compose>
+| |---/docker-compose.yml
 | |
-| |---/<metada_genus1_species1 class="yml"></metada_genus1_species1>
+| |---/metadata_genus1_species1.yml
 |
-|---/<metadata class="yml"></metadata>
+|---/metadata.yml
 |
 |---/main_proxy
     |---/conf
-        |---/<default class="conf"></default>
+        |---/default.conf
 ```
diff --git a/autoload.py b/autoload.py
index be7ceac..031fa04 100644
--- a/autoload.py
+++ b/autoload.py
@@ -41,12 +41,9 @@ STEPS:
 
 class Autoload:
     """
-    The Autoload class possesses most useful parameters to interact with GGA as attributes (as defined in __init__), so new
-    methods can be more easily implemented by copying already existing ones (i.e add new analysis, run a workflow, ...)
+    The Autoload class stores the parameters needed to interact with GGA as attributes
 
-    To run the workflows, place them in the same directory as this script, and add the method + the workflow
-    parameters in the main invocation (at the end of the file)
-    TODO: store main workflow as string?
+    TODO: store default main workflow as string?
""" def __init__(self, species_parameters_dictionary, args): @@ -109,27 +106,34 @@ class Autoload: def get_source_data(self, max_depth): """ + NOT PRODUCTION READY + find and copy source data files to src_data directory tree - recursively search for the correct files (within a fixed max depth) - requires the organism src_data directory tree to already be properly created for the organism (run generate_dir_tree) - the source files must have "transcripts", "proteins"/"pep", "genome" in their name, and a gff extension for the ogs file """ - src_data_dir = os.path.join(self.species_dir, "/src_data") # to limit rewriting of the src_data directory - # some regex bullshit + src_data_dir = os.path.join(self.species_dir, "/src_data") + - sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" + sp_regex = "(?=\w*V)(?=\w*A)(?=\w*R)(?=\w*I)(?=\w*A)(?=\w*B)(?=\w*L)(?=\w*E)\w+" # TODO: improve regex for dirpath, dirnames, files in os.walk(self.source_data_dir): for name in files: print("foo") def regex_generator(self, organism_name_pattern): + """ + + """ re_dict = dict() + re_dict["gff"] = None + re_dict["transcripts"] = None + re_dict["proteins"] = None + re_dict["genome"] = None + for char in organism_name_pattern: - re_dict["gff"] = None - re_dict["transcripts"] = None - re_dict["proteins"] = None - re_dict["genome"] = None + def generate_dir_tree(self): """ @@ -535,7 +539,7 @@ if __name__ == "__main__": ", assuming the instances for the organisms are already generated and initialized", action="store_false") parser.add_argument("--dir", - help="Path of the main directory, either absolute or relative", + help="Path of the main directory, either absolute or relative, defaults to current directory", default=os.getcwd()) args = parser.parse_args() diff --git a/examples/example.json b/examples/example.json new file mode 100755 index 0000000..60e8f59 --- /dev/null +++ b/examples/example.json @@ -0,0 +1,27 @@ +[ + { + "genus" : "genus1", + "species" : "Species1", + "common" : "Common1", + "strain" : "strain1", + "sex" : "male", + "version" : "1.0", + "performed by" : "Institute John Doe", + "genome version" : "1.0", + "ogs version" : "1.0", + "date" : "2020-01-01" + } + { + "genus" : "genus2", + "species" : "Species2", + "common" : "Common2", + "strain" : "strain2", + "sex" : "female", + "version" : "1.0", + "performed by" : "Institute Jane Doe", + "genome version" : "1.0", + "ogs version" : "1.0", + "date" : "2020-01-01" + } +] + diff --git a/examples/example.xlsx b/examples/example.xlsx new file mode 100755 index 0000000000000000000000000000000000000000..8530103dd049d58ff2f90dd1fe7799e427ba8218 GIT binary patch literal 5651 zcmaJ_1z1#Tw+3bq7zBncX^`%cMvy_eyJKLG7)lxx2Bnd1l<pExx<MEuMM>#UL_j1J zggfIo|JCb%&sqDKZ_j?#x7L34>hIN70|3b|aB*=lcATTMF@7U_)PE~asEZdrA9}4$ z8dn7ff+O~WqZ0ysUy|RkZ|I1e$i5r=Fk|G>Mm$r1`hIXA0@t*>80U#f$l<8CV%8#w z!*O(uax?^9VagpFe?b*k_>Ay$hqlAnXq08lP?eR)Zn)xYzsnJ0k{f0{t-!3gPOq`0 z<eGKyjg|=eMf<>TR|_k^mHo$)$nujV*wT5NVZ>>8dR_SrB*J&Y5(^?Rxn@+5z<bI8 z;TLWv)*8Q~XOKsu+2+FU`z=rtmt)%O`AVcmZnJ(vY)038>6Qr3TshJY;Wd|D63w`f zCztI>8r!d7@46@1VR9JSYB;zK6=6*)sLz(d!azO$lVONaJM7_h+MaNCFMd0?CzQ|6 z)#Z_{g%(19EcnzYp;HmRY!0{1KI_{W38|w<=R42AibfGv@s|(ap*<yh2fI%LHOGJ4 zlU#@yUI-OyPoo9_!lV<b_?mfykLpd&NH?g}ptmbc6&@2>Pw;f9W~v%jud2*|8aJ(_ z9a%Mc^&g8(R{-t$#2Cx)aug&zEu3#xBU<~T>-kNw;&EFBuy}aOikn4`-hGUWPs66= zeP?Ktd%H(KpHbcv(Z|@&UX9H=GJr$pRYLZjp3FRjaHDLARu+_9f|u2t+<K{a@iHiA z4|3e%Vl;rUx&1sb+$oO&G)~PL+!E>^x4&nEpQ6mlEE{#<Ue7$mN=T}(q<66s>^wX8 zV$AFWXm0+{CX0K=Z8lksgwRMo&8+FmjiM`oeO9{)8K_d6MQdTL1lb|tusMNNl4S6G 
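The two example files above show the supported input shapes. As a point of reference only, here is a minimal sketch of how such an input could be turned into the per-species parameter dictionaries that `Autoload` receives; `load_input_file` is a hypothetical helper (not part of this patch), and `pandas` is an assumed dependency for the tabulated variants:

```python
import json
import os

import pandas  # assumed dependency; reading .xls/.xlsx also needs an engine such as xlrd or openpyxl


def load_input_file(path):
    """Hypothetical helper: return a list of per-species parameter dicts
    from either a json or a tabulated (xls, xlsx or csv) input file."""
    ext = os.path.splitext(path)[1].lower()
    if ext == ".json":
        with open(path) as f:
            return json.load(f)  # already a list of dicts, as in examples/example.json
    elif ext in (".xls", ".xlsx"):
        return pandas.read_excel(path).to_dict("records")  # one dict per spreadsheet row
    elif ext == ".csv":
        return pandas.read_csv(path).to_dict("records")
    raise ValueError("Unsupported input format: %s" % ext)


# Each dict could then be handed to Autoload, e.g.:
# for sp_dict in load_input_file("examples/example.json"):
#     Autoload(species_parameters_dictionary=sp_dict, args=args)
```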
zZW?bk&+SWUESsOR+&tFtdK%6U_K|Dhvxo_fJ7dnN!>81&hbe5eTf<St9Yq9RS~RKU z({na;Cn%$4Oii+8zpuP=zmX_LOs?ACnBYU3As)9kRt}9qT%6n5Zx3H;r}>e8IVw_V z!KmYAPIl@LkQRRA@YN=Nt19q1luL>>vT`G|q#7AkEWZoAR{2?}xc7c8WMR7e;9I?R zaI;nUOM;WZku&Qp0%5af)ridhT&^SP@xI&Xo<j4S;>o$5!b@!8AFk%qC%QV%Uk6>$ zckPOXq`FG(pZk{{2e(aimAYKH^_}M3yFK-=qbt=ySkr?@X^-gp^#{Joe15g=r)sZw zlVw3-ct)zYZ?>LjUJG9r$NT7bR>`7)Q?y-KR6MPJ(_m17&i&?twd>Q8v|lu>)-CjM zyXn1C4lxHF2pCh6h8;VWoQb0YT0xyRIt?|LR+9W%Kok9%t!&|NXLLx%=^@<~1i&4q z+`uJQUdwy5X#k`61dK&+DKaw&Lg5@uHgu3yaJ57XE#{n8BqDKK&lbl|7yn=ype$er zBX4l05+KsA!O!;*JU_`GbQozEGUzEb1UKLt`!obJNjo)+H!cNwTg_z{M2_~Jx?0{I z-PIP)8`8U{=f(Q&bI=0S%A;zGy0j27pkCLO!gE3!n@b$U^x`3#!b3#@kQ5k@#PYl; z6~<%6tH!ii&ONlQahN%CM`~5b@$ysZiP1V#dYK_BrNGY6(<u~GeYCB1Lg!f&-JCMQ zeM{jM(W~Gg1~3h0(wl_mqSAP`=9VuF???L1M^aWNymobKOc7M{YIa~L;Q0E$|D|go zy}v`?`Ztp@g=e-9me%$}EzKYbY(tUmZJ5}et1qJu`70NSF_tZ+ar$}4fD6gQ3tg%h zz9_|2eIzl5Ri)1D%k1d)Bj2a=YVvJfC%ru{NKT|7Zz!x|PQEkE1C$sECYGq959Jyy zKgR0O4k{puu8-iQ18lNefcvv1YA$D5%#R7~ZuG;cfHwH}ju)`W+!?l{tc@-TmFN-q zCP}xL3||3e2e~D!LMNeY2m(eIf6c~EiaLlYjva9%rcv&QyqQ@l=Tvy(Q&?wFA$)S; z(C?vVV1D`2wp~*xRavF`4+cl}(`}CN(E%gNgLrv_8pNFd{}wQ0e^)@5jVIJz&)d_< z&GBa;Oik6&0x*FG_Cr9ui*Fu?mCP-^%6RK{Q1D8)eu@_<#Mmdc+mvprE}glGdqK6d z4$Yg(UF6UNBLiGEZN<{DCeD`DCWcR&e(ZLTrHGsa9N|{CV<0Vjr@(B9P0TO8X6Vjx zDQ)3uPL1Gnw|jn>+)jz`&?aO4_%t2^8Fx?O89Zf*qF5lL_wDc~BpPpb2KYpdwDQGg zntBU8!a3cEo6mA9!TqE|ZWuXbI_1u9Q@=+PVZ6dh))1_;&6k}IjUvf0$vGJFw?twb z1awTqjN>~icJlA9h&w**3#~Y_WVa3#RZ<&^$;!)gSIHXOD}?klY#j{&tmw~9(nU`) z-Jg%!$<?!4o|C<(q<Nv<T(<eVHgur}$GX%Yw%R#XaMd<;P?lY$J}|hT?QgzT0>gug zLZi*Ag7a%=`xCC8|Byx5-xbw`{`oLO^Oybq0z}Y9qW`4rU*JfhhI605^~d`^$RVc# zDmZqrw>n~P*Zw%ee5JEC(Jz)X{o*QwkJbzX{XCI8F>}~?_xO`Y+^5KX#p~B93V@G# z27GWkLVxV-u2LJ07A&p6*q+oekW{vl&OW46;WuqKyk){?+nXR|QG`_0ET2KPB6fB7 zUw>7Z%E25Z*Q+)IG`ewXw|Vtm)!EGu5Ly)mJ`)%sz#H|XPZXZiePX-9%`b#=<oaz> zW>b!tZkA=Z7V%71%op#Sl>_~L06rXVp;|lS^9AeS*QQLR>e{8nAS|?Hw7z@`c!T<G zwtqcmlcIK*DZ<^np>Ez*`u^@vFAH=ztJX)NkJ_km#%!hYISrt|;Y7rvTy{r-#ZHT4 z$}e#@?PQ;t9oK-PQE9_|P8go)I5%Khy5NE@%i*jlRH&B}N^Hobm%af#POwTI2-2-O zAyjzWaWkbTYI0G4;tn7S=N=tZmHxCdy~mF8R<rRT&FrRkz(BYtvHFPJ8sp&_Ay=o9 zoljTax+Uc*YuoO5lD<l_S2c;WZDuhSZNERPk9cKzh+*$5zav?qg%&IYpR$YlmfjUu zTT&x!EP{BS@G+AGrl1y8lMzWh+v!1~vx{}D6A485spDN0)8|Li*zFWVM}A+Fk4Pka z?*e7HZfOyhU-7{zRq^EG;~=<?$QptEB_}mdhUr>G02r{7HrFKs{1Ey&UcKok`zrhi z_>?z1aUk%GFo|D-kIYLS)AFt1oUHa$`VFt6v+`K_Zl+T_h94|a>UJAA;+)2(I%9H} zXs=vMA$$x_@oI#Pj#o@9GK^pN-R}Sv8o&F$^c$M@r{M393L4+~MbxNQ{TJHxr}*!H z0vdPuMct^_`CswBQJDXd{%r<CgATt46Zp>%#GhJzZ@lPv@fXE{erowOJ^m^BdkIE& z=3iui()2e5@}GqNQ~3A#fv(cOC?2H?CG7QQ&Hhuv?|Fu<Grx!s<qMRCf2cZt>iIn~ g&{_G5I8ejL|C6KIYS^fvhk=2Iy5vxf>ijA7Kb(G+UH||9 literal 0 HcmV?d00001 diff --git a/ext_scripts/__init__.py b/ext_scripts/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/ext_scripts/blastdb.py b/ext_scripts/blastdb.py new file mode 100755 index 0000000..2794ccb --- /dev/null +++ b/ext_scripts/blastdb.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import argparse +import collections +import json +import logging as log +import os +import sys + +from shutil import copyfile +from subprocess import call + + +class BlastBank: + + def __init__(self, raw_org, data_dir_root, rel_path, fasta_file_name, db_dir_root, seq_type, path, is_multi): + self.raw_org = raw_org + self.org = prettify(raw_org) + self.data_dir_root = data_dir_root + self.rel_path = rel_path + self.fasta_file_name = fasta_file_name + self.db_dir_root = db_dir_root + self.seq_type = seq_type + self.path = path # http://bipaa.genouest.org/sp/xxx/ Can be the same as raw_org, or 
something else when having multiple genomes. + self.is_multi = is_multi + + self.fasta = os.path.join(data_dir_root, rel_path, fasta_file_name) + self.dest_path = os.path.splitext(os.path.join(db_dir_root, self.path, rel_path, fasta_file_name))[0] + self.title = sanitize(rel_path + '_' + os.path.splitext(self.fasta_file_name)[0]) + if self.is_multi: + fake_path = rel_path.split('/') + if len(fake_path) > 2: + fake_path = [fake_path[1]] + [fake_path[0]] + fake_path[2:] + fake_path = '/'.join(fake_path) + self.pretty_name = prettify(fake_path, True) + else: + self.pretty_name = self.org + ' ' + prettify(rel_path, False) + + with open(self.fasta, 'r') as f: + self.first_id = f.readline()[1:].rstrip() + + if self.seq_type == 'nucl': + if 'transcript' in self.fasta_file_name.lower() or 'cdna' in self.fasta_file_name.lower(): + self.pretty_name += " transcripts" + elif 'cds' in self.fasta_file_name.lower(): + self.pretty_name += " CDS" + else: + if 'protein' in self.fasta_file_name.lower() or 'pep' in self.fasta_file_name.lower() or 'proteome' in self.fasta_file_name.lower() or self.fasta_file_name.endswith('.faa'): + self.pretty_name += " proteins" + + # Just a stupid/hacky string used for sorting bank list + self.sort_key = 'a_' if 'genome' in self.title else 'b_' + self.sort_key += self.pretty_name + + def __str__(self): + return str({ + 'raw_org': self.raw_org, + 'org': self.org, + 'data_dir_root': self.data_dir_root, + 'rel_path': self.rel_path, + 'fasta_file_name': self.fasta_file_name, + 'db_dir_root': self.db_dir_root, + 'seq_type': self.seq_type, + 'path': self.path, + 'fasta': self.fasta, + 'dest_path': self.dest_path, + 'title': self.title, + 'pretty_name': self.pretty_name, + }) + + +def main(args): + + genome_path = os.path.basename(os.getcwd()) + if not args.multi_org: + genome_name = genome_path + data_dir_root = os.path.abspath(os.path.join('src_data')) + if not os.path.isdir(data_dir_root): + raise Exception("Could not find data dir: %s" % data_dir_root) + + db_dir_root = os.path.abspath(args.dest) + + ignore_list = ['func_annot', "apollo_source"] + if args.ignore: + ignore_list += args.ignore + + # Looking for files + log.info("Looking for fasta files in %s:" % data_dir_root) + banks = [] + for root, dirs, files in os.walk(data_dir_root, followlinks=True): + file_list = [os.path.realpath(os.path.join(root, filename)) for filename in files] + rel_path = root[len(data_dir_root) + 1:] + + skip_current = False + for ign in ignore_list: + if ign in rel_path: + skip_current = True + + if not skip_current: # skip useless data + for f in file_list: + f = os.path.basename(f) + if f.endswith('.fasta') or f.endswith('.fa') or f.endswith('.fna') or f.endswith('.faa'): + if args.multi_org: + genome_name = rel_path.split('/')[1] + + if 'protein' in f or 'pep.' 
in f or 'proteome' in f or f.endswith('.faa'):
+                        seq_type = 'prot'
+                    else:
+                        seq_type = 'nucl'
+                    new_bank = BlastBank(genome_name, data_dir_root, rel_path, f, db_dir_root, seq_type, genome_path, args.multi_org)
+                    log.info("Found '%s' of type: %s" % (new_bank.fasta, new_bank.seq_type))
+                    banks.append(new_bank)
+
+    if not banks:
+        log.info("No fasta file found.")
+    else:
+        for b in banks:
+            makeblastdb(b, args.dry_run, args.no_parse_seqids)
+
+    nuc_list = collections.OrderedDict()
+    prot_list = collections.OrderedDict()
+    banks.sort(key=lambda x: x.sort_key)
+    for b in banks:
+        if b.seq_type == 'nucl':
+            if b.pretty_name not in nuc_list.values():  # compare against registered pretty names (the dict values), not the dest_path keys
+                nuc_list[b.dest_path] = b.pretty_name
+            else:
+                nuc_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name)
+        else:
+            if b.pretty_name not in prot_list.values():
+                prot_list[b.dest_path] = b.pretty_name
+            else:
+                prot_list[b.dest_path] = "%s (%s)" % (b.pretty_name, b.fasta_file_name)
+
+    yml_dir = os.path.abspath('blast')
+    yml_file_path = os.path.abspath(os.path.join(yml_dir, 'banks.yml'))
+    links_file_path = os.path.abspath(os.path.join(yml_dir, 'links.yml'))
+    if not args.dry_run:
+
+        log.info("List of bank names (to use in links.yml):")
+        write_titles(banks)
+
+        log.info("Writing bank list in '%s'" % yml_file_path)
+        if not os.path.exists(yml_dir):
+            os.makedirs(yml_dir, mode=0o755)
+        yml_file = open(yml_file_path, 'w')
+        write_yml(yml_file, nuc_list, prot_list)
+
+        log.info("Writing automatic links to links.yml in '%s'" % links_file_path)
+        if os.path.exists(links_file_path):
+            log.info("Making backup of previous links.yml to '%s'" % (links_file_path + '.back'))
+            copyfile(links_file_path, links_file_path + '.back')
+        links_yml_file = open(links_file_path, 'w')
+        write_links_yml(links_yml_file, banks, args.apollo)
+
+    else:
+        log.info("List of bank names (to use in links.yml):")
+        write_titles(banks)
+        log.info("Would write bank list in '%s'" % yml_file_path)
+        write_yml(sys.stdout, nuc_list, prot_list)
+        log.info("Would write links.yml in '%s'" % links_file_path)
+        write_links_yml(sys.stdout, banks, args.apollo)
+
+
+def write_yml(yml_file, nuc_list, prot_list):
+
+    nuc = "~"
+    prot = "~"
+
+    if nuc_list:
+        nuc = "\n                ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in nuc_list.items()])
+    if prot_list:
+        prot = "\n                ".join(['%s: %s' % (json.dumps(k), json.dumps(v)) for k, v in prot_list.items()])
+
+    print("genouest_blast:", file=yml_file)
+    print("    db_provider:", file=yml_file)
+    print("        list:", file=yml_file)
+    print("            nucleic:", file=yml_file)
+    print("                %s" % nuc, file=yml_file)
+    print("            proteic:", file=yml_file)
+    print("                %s" % prot, file=yml_file)
+
+
+def write_links_yml(yml_file, banks, apollo):
+
+    for bank in banks:
+        print("", file=yml_file)
+        print("# %s" % (bank.pretty_name), file=yml_file)
+
+        link = ''
+        if bank.seq_type == 'prot':
+            spl = bank.org.split()
+            if len(spl) > 2:
+                sp_str = '/'.join(spl[:2])
+                sp_str += '-' + '-'.join(spl[2:])
+            else:
+                sp_str = '/'.join(spl)
+            link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/polypeptide/{id}' % (bank.path, sp_str)
+        elif 'genome' in bank.title:
+            dataset_id = bank.org.lower()
+            spl = dataset_id.split()
+            if len(spl) == 2:  # Genus species => gspecies
+                dataset_id = spl[0][:1] + spl[1]
+            elif len(spl) == 3:  # Genus species strain1 => gsstrain1
+                dataset_id = spl[0][:1] + spl[1][:1] + spl[2]
+            else:  # Genus species some garbage => genus_species_some_garbage
+                dataset_id = dataset_id.replace(' ', '_')
+            if apollo:
+                link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + 
bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a> <a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/apollo/annotator/loadLink?loc={id}:1{apollo_track}">Apollo</a>'
+            else:
+                link = '<a href="http://abims-gga.sb-roscoff.fr/sp/' + bank.path + '/jbrowse/?data=data%2F' + dataset_id + '&loc={id}{jbrowse_track}">{id}</a>'
+        else:
+            spl = bank.org.split()
+            if len(spl) > 2:
+                sp_str = '/'.join(spl[:2])
+                sp_str += '-' + '-'.join(spl[2:])
+            else:
+                sp_str = '/'.join(spl)
+            link = 'http://abims-gga.sb-roscoff.fr/sp/%s/feature/%s/mRNA/{id}' % (bank.path, sp_str)
+
+        if link:
+            print("%s:" % (bank.title), file=yml_file)
+            print("    db: '%s'" % (bank.title), file=yml_file)
+            print("    '*': '%s'" % (link), file=yml_file)
+        else:
+            print("# Skipped", file=yml_file)
+
+
+def write_titles(banks):
+
+    for bank in banks:
+        print("'%s' -> '%s' [%s]" % (bank.pretty_name, bank.title, bank.first_id))
+
+
+def makeblastdb(bank, dry_run, no_parse_seqids):
+    log.info("Formatting bank: %s ---> %s" % (bank.fasta, bank.dest_path))
+    dest_dir = os.path.realpath(os.path.join(bank.dest_path, '..'))
+    if not os.path.exists(dest_dir):
+        log.info("Creating folder: %s" % dest_dir)
+        if not dry_run:
+            os.makedirs(dest_dir, mode=0o755)
+    parse = "-parse_seqids"
+    if no_parse_seqids:
+        parse = ""
+    cmd = "makeblastdb -in '%s' -dbtype '%s' %s -out '%s' -title '%s'" % (bank.fasta, bank.seq_type, parse, bank.dest_path, bank.title)
+    log.info("Running: %s" % cmd)
+    if not dry_run:
+        try:
+            retcode = call(cmd, shell=True)
+            if retcode != 0:
+                raise RuntimeError("Child was terminated by signal " + str(retcode))
+        except OSError as e:
+            print("Execution failed: " + str(e), file=sys.stderr)  # str(e): concatenating the exception object itself would raise a TypeError
+            sys.exit(1)
+
+
+def prettify(name, capital=True):
+    name = name.replace('_', ' ')
+    name = name.replace('/', ' ')
+    if capital:
+        name = name[0].upper() + name[1:]
+
+    return name
+
+
+def sanitize(name):
+    name = name.lower()
+    name = name.replace(' ', '_')
+    name = name.replace('/', '_')
+
+    return name
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate blast databanks and update blast forms.'
+    )
+    parser.add_argument("-v", "--verbose", help="Increase output verbosity.",
+                        action="store_true")
+    parser.add_argument("-d", "--dry-run", help="Dry run: no modification will be done, for testing purpose.",
+                        action="store_true")
+    parser.add_argument("-m", "--multi-org", help="Add this flag if there are multiple organisms in src_data.",
+                        action="store_true")
+    parser.add_argument("-a", "--apollo", help="Add this flag to generate links to apollo.",
+                        action="store_true")
+    parser.add_argument("-p", "--no-parse-seqids", help="Don't use the makeblastdb -parse_seqids option (use this in case you have strange looking sequence ids that make html files unreadable)",
+                        action="store_true")
+    parser.add_argument("--ignore", help='Files or directories to ignore', nargs='*')
+    parser.add_argument("dest", help="Destination directory (not including the genome name, should be mounted on compute nodes)")
+
+    args = parser.parse_args()
+    log.basicConfig(level=log.INFO)
+    if args.verbose:
+        log.basicConfig(level=log.DEBUG)
+
+    main(args)
diff --git a/ext_scripts/common-stringSubsitute.py b/ext_scripts/common-stringSubsitute.py
new file mode 100755
index 0000000..c32a177
--- /dev/null
+++ b/ext_scripts/common-stringSubsitute.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import argparse
+import os
+import re
+import sys
+
+# Write the file obtained by replacing the occurrences of pattern by the replacement string.
+# Uses the python method re.sub()
+# python common-stringSubsitute.py -i file -p pattern -r replacement_string
+# ex : python common-stringSubsitute.py -i file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'
+
+if __name__ == '__main__':
+
+    # Get arguments
+    parser = argparse.ArgumentParser(description="Write the file obtained by replacing the occurrences of pattern by the replacement string. Uses the python method re.sub(). Example: python common-stringSubsitute.py -i file -p '(tRNA)(\w{3})(\w{3})' -r '\g<1>-\g<2>(\g<3>)'")
+    parser.add_argument('-i','--infile', help='Input file', required=True)
+    parser.add_argument('-o','--outfile', help='Output file', default='outfile')
+    parser.add_argument('-p','--pattern', help='Pattern string to be replaced', required=True)
+    parser.add_argument('-r','--repl', help='Replacement string', required=True)
+    args = parser.parse_args()
+
+    infilename=args.infile
+    outfilename=args.outfile
+    pattern=args.pattern
+    repl=args.repl
+
+    infile=open(infilename,'r')
+    outfile=open(outfilename,'w')
+
+    lines=infile.readlines()
+
+    for line in lines:
+        line_out=re.sub(pattern,repl,line)
+        outfile.write(line_out)
+
+    outfile.close()
\ No newline at end of file
diff --git a/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh b/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh
new file mode 100755
index 0000000..0de7b9b
--- /dev/null
+++ b/ext_scripts/phaeoexplorer-change_pep_fasta_header.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+INFILE=$1
+OUTFILE=tmpfile
+
+FILE_HEADER_START=$(grep ">" $INFILE | cut -c 1-6 | sort | uniq)
+HEADER_START_STRING=">mRNA."
+
+if [[ "$FILE_HEADER_START" == "$HEADER_START_STRING" ]]
+then
+    /usr/local/genome2/mmo/scripts/common/common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>mRNA' -r '>protein'
+    mv $OUTFILE $INFILE
+    echo "'>mRNA' replaced by '>protein'"
+else
+    echo "Abort. Not all headers start with '>mRNA.':"
+    echo "$FILE_HEADER_START"
+fi
\ No newline at end of file
diff --git a/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh b/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh
new file mode 100755
index 0000000..957190f
--- /dev/null
+++ b/ext_scripts/phaeoexplorer-change_transcript_fasta_header.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+INFILE=$1
+OUTFILE=tmpfile
+
+./common-stringSubstitute.py -i $INFILE -o $OUTFILE -p '^>\d+ mRNA' -r '>mRNA'
+mv $OUTFILE $INFILE
+echo "'>[0-9]+ mRNA' replaced by '>mRNA' in $1"
diff --git a/templates/compose-template.yml b/templates/compose-template.yml
new file mode 100755
index 0000000..ee3e04e
--- /dev/null
+++ b/templates/compose-template.yml
@@ -0,0 +1,229 @@
+# ./docker_data is created and filled with persistent data that should be backed up
+
+version: '2'
+services:
+  proxy:
+    image: quay.io/abretaud/nginx-ldap:latest # Not using the default nginx image as we need the ldap module
+    ports:
+      - "9170:80"
+    links:
+      - galaxy
+      - jbrowse
+      - tripal
+      - apollo
+      - apollo-report
+      - blast
+      - wiki
+    volumes:
+      - ./src_data/:/project_data/
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+      - ./nginx/conf:/etc/nginx/conf.d
+
+  tripal:
+    image: quay.io/galaxy-genome-annotation/tripal@sha256:4451cc3a601d109c07c7aedcc76bd41a5da7c438c8fa0862488680bd462f125b
+    links:
+      - tripaldb:postgres
+      - elasticsearch:elasticsearch
+    volumes_from:
+      - "galaxy"
+    volumes:
+      - ./src_data:/data:ro
+    environment:
+      BASE_URL_PATH: /sp/genus_species
+      UPLOAD_LIMIT: 20M
+      MEMORY_LIMIT: 512M
+      TRIPAL_GIT_CLONE_MODULES: "https://github.com/abretaud/tripal_rest_api.git[@c6f9021ea5d4c6d7c67c5bd363a7dd9359228bbc] https://github.com/tripal/tripal_elasticsearch.git[@dc7f276046e394a80a7dfc9404cf1a149006eb2a] https://github.com/tripal/tripal_analysis_interpro.git https://github.com/tripal/tripal_analysis_go.git https://github.com/tripal/tripal_analysis_blast.git https://github.com/tripal/tripal_analysis_expression.git[@7240039fdeb4579afd06bbcb989cb7795bd4c342]"
+      TRIPAL_DOWNLOAD_MODULES: ""
+      TRIPAL_ENABLE_MODULES: "tripal_analysis_blast tripal_analysis_interpro tripal_analysis_go tripal_rest_api tripal_elasticsearch"
+      SITE_NAME: "Genus species"
+      ENABLE_JBROWSE: /jbrowse/?data=data/gspecies
+      ENABLE_APOLLO: 1
+      ENABLE_BLAST: 1
+      ENABLE_DOWNLOAD: 1
+      ENABLE_WIKI: 1
+      ENABLE_GO: /organism/Genus/species?pane=GO
+      ENABLE_ORTHOLOGY: 0
+      ENABLE_ORTHOLOGY_LINKS: http://localhost/sp/orthology/
+      #THEME: "bipaa" # Use this to use another theme
+      #THEME_GIT_CLONE: "https://gitlab.inria.fr/abretaud/tripal_bipaa.git" # Use this to install another theme
+      ADMIN_PASSWORD: XXXXXX # You need to define it and update it in galaxy config below
+
+  tripaldb:
+    image: quay.io/galaxy-genome-annotation/chado:1.31-jenkins110.1-pg9.5
+    environment:
+      - POSTGRES_PASSWORD=postgres
+      # The default chado image would try to install the schema on first run,
+      # we just want the tools to be available.
+      - INSTALL_CHADO_SCHEMA=0
+    volumes:
+      - ./docker_data/tripal_db/:/var/lib/postgresql/data/
+
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:6.6.1
+    #mem_limit: 500m # This is to limit usage, but it can make the container crash when RAM is exhausted; not recommended while indexing all data, ok once indexing is done
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    volumes:
+      - ./docker_data/elastic_search_index/:/usr/share/elasticsearch/data
+    environment:
+      bootstrap.memory_lock: "true"
+      xpack.security.enabled: "false"
+      xpack.monitoring.enabled: "false"
+      xpack.ml.enabled: "false"
+      xpack.graph.enabled: "false"
+      xpack.watcher.enabled: "false"
+      cluster.routing.allocation.disk.threshold_enabled: "false"
+      ES_JAVA_OPTS: "-Xms200m -Xmx200m"
+      TAKE_FILE_OWNERSHIP: "true"
+
+  galaxy:
+    image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod
+    volumes:
+      - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py
+      - ./docker_data/galaxy:/export
+      - ./src_data/:/project_data:ro
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+      - ./docker_data/jbrowse/:/jbrowse/data/
+      - ./docker_data/apollo/:/apollo-data/
+    links:
+      - "tripaldb:chado"
+    environment:
+      NONUSE: nodejs,proftp,reports
+      GALAXY_LOGGING: full
+      GALAXY_CONFIG_BRAND: "Genus species"
+      GALAXY_CONFIG_ALLOW_LIBRARY_PATH_PASTE: "True"
+      GALAXY_CONFIG_USE_REMOTE_USER: "True"
+      GALAXY_CONFIG_REMOTE_USER_MAILDOMAIN: "bipaa"
+      GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org,gogepp@bipaa" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect
+      ENABLE_FIX_PERMS: 0
+      PROXY_PREFIX: /sp/genus_species/galaxy
+      GALAXY_CONFIG_COOKIE_PATH: /galaxy
+      GALAXY_TRIPAL_PASSWORD: XXXXXX # See tripal config above
+      GALAXY_WEBAPOLLO_URL: http://apollo:8080
+      GALAXY_WEBAPOLLO_USER: "admin_apollo@bipaa"
+      GALAXY_WEBAPOLLO_PASSWORD: "XXXXXX" # See apollo config below
+      GALAXY_WEBAPOLLO_EXT_URL: /sp/genus_species/apollo
+      GALAXY_CHADO_DBHOST: chado
+      GALAXY_CHADO_DBSCHEMA: chado
+      GALAXY_AUTO_UPDATE_DB: 1
+      GALAXY_AUTO_UPDATE_CONDA: 1
+      GALAXY_AUTO_UPDATE_TOOLS: "/galaxy-central/tools_1.yaml"
+      BLAT_ENABLED: 1
+
+  jbrowse:
+    image: quay.io/galaxy-genome-annotation/jbrowse:v1.16.5
+    volumes:
+      - /data
+    volumes_from:
+      - "galaxy:ro"
+    ports:
+      - "80"
+
+  apollo:
+    image: quay.io/abretaud/apollo:bipaa
+    links:
+      - "apollo_db:db"
+    environment:
+      APOLLO_ADMIN_EMAIL: "admin_apollo@bipaa" # internal admin user, used by some scripts/api
+      APOLLO_ADMIN_PASSWORD: "XXXXXX" # define it and adapt the galaxy config above
+      APOLLO_BASE_HOST: "http://localhost"
+      APOLLO_PATH_PREFIX: "/sp/genus_species/apollo/"
+      APOLLO_REMOTE_ADMINS: "gogepp@bipaa,abretaud@bipaa,srobin@bipaa,flegeai@bipaa" # all ldap users that we use to connect as admin
+      WEBAPOLLO_DB_USERNAME: postgres
+      WEBAPOLLO_DB_PASSWORD: password
+      WEBAPOLLO_DB_DRIVER: "org.postgresql.Driver"
+      WEBAPOLLO_DB_DIALECT: "org.hibernate.dialect.PostgresPlusDialect"
+      WEBAPOLLO_DB_URI: "jdbc:postgresql://db/postgres"
+      WEBAPOLLO_FEATURE_HAS_DBXREFS: "true"
+      WEBAPOLLO_FEATURE_HAS_ATTRS: "true"
+      WEBAPOLLO_FEATURE_HAS_PUBMED: "true"
+      WEBAPOLLO_FEATURE_HAS_GO: "true"
+      WEBAPOLLO_FEATURE_HAS_COMMENTS: "true"
+      WEBAPOLLO_FEATURE_HAS_STATUS: "true"
+      CATALINA_OPTS: "-Xms512m -Xmx1g -XX:+CMSClassUnloadingEnabled -XX:+CMSPermGenSweepingEnabled -XX:+UseConcMarkSweepGC"
+    volumes_from:
+      - "galaxy:ro"
+    volumes:
+      - ./apollo/annotation_groups.tsv:/bootstrap/canned_values.txt:ro # Other canned things are preloaded in the docker image https://github.com/abretaud/docker-apollo/blob/bipaa/bootstrap.sh
+      #- ../blat/:/opt/blat/:ro # Mount the blat binary if you want to use it (could not include it in the docker image due to license issue)
+
+  apollo_db:
+    image: postgres:9.5
+    environment:
+      POSTGRES_PASSWORD: password
+    volumes:
+      - ./docker_data/apollo_db/:/var/lib/postgresql/data/
+
+  apollo-report: # A report app following guidelines from https://bipaa.genouest.org/is/how-to-annotate-a-genome/
+    links:
+      - apollo:apollo
+    image: quay.io/abretaud/apollo-report:latest
+    environment:
+      APOLLO_EXT_URL: http://localhost/sp/genus_species/apollo/
+      APOLLO_USER: admin_apollo@bipaa
+      APOLLO_PASS: XXXXX # See apollo conf above
+      ADMIN_USERS: login1,login2 # ldap users that should see an extended report
+      APOLLO_MOUNTPOINT: /apollo-data/
+    volumes:
+      - ./docker_data/apollo/:/apollo-data/:ro
+      - ./apollo/annotation_groups.tsv:/data/annotation_groups.tsv:ro
+      - ./docker_data/apollo_report/:/data/report/
+
+  blast:
+    image: quay.io/abretaud/sf-blast:latest
+    links:
+      - blast_db:postgres
+    #hostname: gogepp-blast.genouest.org # Hostname declared as a submit node in the sge conf (for drmaa mode only)
+    environment:
+      UPLOAD_LIMIT: 20M
+      MEMORY_LIMIT: 128M
+      DB_NAME: 'postgres'
+      ADMIN_EMAIL: 'xxx@example.org' # email sender
+      ADMIN_NAME: 'xxxxx' # email sender name
+      JOBS_METHOD: 'local' # Can be local (= no sge jobs, but run inside the container) or drmaa (= to submit to a cluster)
+      JOBS_WORK_DIR: '/xxxx/blast_jobs/' # disk accessible both from compute nodes and mounted in this docker (at the same path)
+      CDD_DELTA_PATH: '/db/cdd_delta/current/flat/cdd_delta'
+      BLAST_TITLE: 'Genus species blast server'
+      JOBS_SCHED_NAME: 'blast_gspecies' # job names
+      PRE_CMD: '. /local/env/envblast-2.6.0.sh; . /local/env/envpython-2.7.sh;' # executed at the beginning of each job
+      APACHE_RUN_USER: 'bipaaweb' # username known by sge
+      APACHE_RUN_GROUP: 'bipaa' # group known by sge
+      BASE_URL_PATH: '/sp/genus_species/blast/'
+      UID: 55914 # user id known by sge (for drmaa mode only)
+      GID: 40259 # group id known by sge (for drmaa mode only)
+    volumes:
+      #- ../blast-themes/xxx/:/var/www/blast/app/Resources/:ro # You can theme the app
+      - /data1/sge/:/usr/local/sge/:ro # the sge install
+      #- /xxxx/blast_jobs/:/xxxx/blast_jobs/ # (for drmaa mode only)
+      - ./blast/banks.yml:/var/www/blast/app/config/banks.yml:ro
+      - ./blast/links.yml:/etc/blast_links/links.yml:ro
+
+  blast_db:
+    image: postgres:9.5
+    environment:
+      - POSTGRES_PASSWORD=postgres
+      - PGDATA=/var/lib/postgresql/data/
+    volumes:
+      - ./docker_data/blast_db/:/var/lib/postgresql/data/
+
+  wiki:
+    image: quay.io/abretaud/mediawiki
+    environment:
+      MEDIAWIKI_SERVER: http://localhost
+      MEDIAWIKI_PROXY_PREFIX: /sp/genus_species/wiki
+      MEDIAWIKI_SITENAME: Genus species
+      MEDIAWIKI_SECRET_KEY: XXXXXXXXXX
+      MEDIAWIKI_DB_PASSWORD: password
+      MEDIAWIKI_ADMIN_USER: abretaud # ldap user
+    links:
+      - wiki_db:db
+    volumes:
+      - ./docker_data/wiki_uploads:/images
+      #- ../bipaa_wiki.png:/var/www/mediawiki/resources/assets/wiki.png:ro # To change the logo at the top left
+
+  wiki_db:
+    image: postgres:9.5
+    volumes:
+      - ./docker_data/wiki_db/:/var/lib/postgresql/data/
diff --git a/templates/stack_template.yml b/templates/stack_template.yml
new file mode 100755
index 0000000..68adc74
--- /dev/null
+++ b/templates/stack_template.yml
@@ -0,0 +1,286 @@
+# ./docker_data is created and filled with persistent data that should be backed up
+
+version: '3.7'
+services:
+  proxy:
+    image: quay.io/abretaud/nginx-ldap:latest
+    volumes:
+      - ./src_data/:/project_data/
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+      - ./nginx/conf:/etc/nginx/conf.d
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        # Download page
+        - "traefik.http.routers.genus_species-nginx.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species/download`))"
+        - "traefik.http.routers.genus_species-nginx.tls=true"
+        - "traefik.http.routers.genus_species-nginx.entryPoints=webs"
+        - "traefik.http.routers.genus_species-nginx.middlewares=sp-auth,sp-app-trailslash,sp-prefix"
+        - "traefik.http.services.genus_species-nginx.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  tripal:
+    image: quay.io/galaxy-genome-annotation/tripal:v2.x
+    depends_on:
+      - tripal-db
+      - elasticsearch
+    volumes:
+      - ./docker_data/galaxy/:/export/:ro
+      - ./src_data/:/project_data/:ro
+      - ./src_data:/data:ro
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+    environment:
+      DB_HOST: tripal-db.genus_species
+      BASE_URL_PATH: /sp/genus_species
+      UPLOAD_LIMIT: 20M
+      MEMORY_LIMIT: 512M
+      TRIPAL_GIT_CLONE_MODULES: "https://github.com/abretaud/tripal_rest_api.git[@c6f9021ea5d4c6d7c67c5bd363a7dd9359228bbc] https://github.com/tripal/tripal_elasticsearch.git[@dc7f276046e394a80a7dfc9404cf1a149006eb2a] https://github.com/tripal/tripal_analysis_interpro.git https://github.com/tripal/tripal_analysis_go.git https://github.com/tripal/tripal_analysis_blast.git https://github.com/tripal/tripal_analysis_expression.git[@7240039fdeb4579afd06bbcb989cb7795bd4c342]"
+      TRIPAL_DOWNLOAD_MODULES: ""
+      TRIPAL_ENABLE_MODULES: "tripal_analysis_blast tripal_analysis_interpro tripal_analysis_go tripal_rest_api tripal_elasticsearch"
+      SITE_NAME: "Genus species"
+      ELASTICSEARCH_HOST: elasticsearch.genus_species
+      ENABLE_JBROWSE: /jbrowse/?data=data/gspecies
+      ENABLE_APOLLO: https://localhost/apollo/
+      ENABLE_BLAST: 1
+      ENABLE_DOWNLOAD: 1
+      ENABLE_WIKI: 1
+      ENABLE_GO: /organism/Genus/species?pane=GO
+      ENABLE_ORTHOLOGY: 0
+      ENABLE_ORTHOLOGY_LINKS: http://localhost/sp/orthology/
+      #THEME: "bipaa" # Use this to use another theme
+      #THEME_GIT_CLONE: "https://gitlab.inria.fr/abretaud/tripal_bipaa.git" # Use this to install another theme
+      ADMIN_PASSWORD: XXXXXX # You need to define it and update it in galaxy config below
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        - "traefik.http.routers.genus_species-tripal.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species`))"
+        - "traefik.http.routers.genus_species-tripal.tls=true"
+        - "traefik.http.routers.genus_species-tripal.entryPoints=webs"
+        - "traefik.http.routers.genus_species-tripal.middlewares=sp-auth,sp-trailslash,sp-prefix,tripal-addprefix"
+        - "traefik.http.services.genus_species-tripal.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  tripal-db:
+    image: quay.io/galaxy-genome-annotation/chado:1.31-jenkins26-pg9.5
+    environment:
+      - POSTGRES_PASSWORD=postgres
+      # The default chado image would try to install the schema on first run,
+      # we just want the tools to be available.
+      - INSTALL_CHADO_SCHEMA=0
+    volumes:
+      - ./docker_data/tripal_db/:/var/lib/postgresql/data/
+    networks:
+      - genus_species
+
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:6.6.1
+    #deploy:
+      #resources:
+        #limits:
+          #memory: 500M
+    volumes:
+      - ./docker_data/elastic_search_index/:/usr/share/elasticsearch/data/
+    environment:
+      bootstrap.memory_lock: "true"
+      xpack.security.enabled: "false"
+      xpack.monitoring.enabled: "false"
+      xpack.ml.enabled: "false"
+      xpack.graph.enabled: "false"
+      xpack.watcher.enabled: "false"
+      cluster.routing.allocation.disk.threshold_enabled: "false"
+      ES_JAVA_OPTS: "-Xms500m -Xmx500m"
+      TAKE_FILE_OWNERSHIP: "true"
+    networks:
+      - genus_species
+
+  galaxy:
+    image: quay.io/galaxy-genome-annotation/docker-galaxy-annotation:gmod
+    volumes:
+      - ../galaxy_data_libs_SI.py:/opt/setup_data_libraries.py
+      - ./docker_data/galaxy/:/export/
+      - ./src_data/:/project_data/:ro
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+      - ./docker_data/jbrowse/:/jbrowse/data/
+      - ./docker_data/apollo/:/apollo-data/
+      - ../galaxy_nginx.conf:/etc/nginx/uwsgi_params
+    environment:
+      NONUSE: nodejs,proftp,reports,condor
+      GALAXY_LOGGING: full
+      GALAXY_CONFIG_BRAND: "Genus species"
+      GALAXY_CONFIG_ALLOW_LIBRARY_PATH_PASTE: "True"
+      GALAXY_CONFIG_USE_REMOTE_USER: "True"
+      GALAXY_CONFIG_REMOTE_USER_MAILDOMAIN: "bipaa"
+      GALAXY_CONFIG_ADMIN_USERS: "admin@galaxy.org,gogepp@bipaa" # admin@galaxy.org is the default (leave it), gogepp@bipaa is a shared ldap user we use to connect
+      ENABLE_FIX_PERMS: 0
+      PROXY_PREFIX: /sp/genus_species/galaxy
+      GALAXY_TRIPAL_URL: http://tripal.genus_species/tripal/
+      GALAXY_TRIPAL_PASSWORD: XXXXXX # See tripal config above
+      GALAXY_WEBAPOLLO_URL: http://one-of-the-swarm-node:8888/apollo/
+      GALAXY_WEBAPOLLO_USER: "admin_apollo@bipaa"
+      GALAXY_WEBAPOLLO_PASSWORD: "XXXXXX" # Set it to match the password of the external Apollo instance
+      GALAXY_WEBAPOLLO_EXT_URL: /apollo/
+      GALAXY_CHADO_DBHOST: tripal-db.genus_species
+      GALAXY_CHADO_DBSCHEMA: chado
+      GALAXY_AUTO_UPDATE_DB: 1
+      GALAXY_AUTO_UPDATE_CONDA: 1
+      GALAXY_AUTO_UPDATE_TOOLS: "/galaxy-central/tools_1.yaml"
+      GALAXY_SHARED_DIR: ""
+      BLAT_ENABLED: 1
+      master_api_key: MASTERLOCK
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        - "traefik.http.routers.genus_species-galaxy.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species/galaxy`))"
+        - "traefik.http.routers.genus_species-galaxy.tls=true"
+        - "traefik.http.routers.genus_species-galaxy.entryPoints=webs"
+        - "traefik.http.routers.genus_species-galaxy.middlewares=sp-auth,sp-app-trailslash,sp-app-prefix"
+        - "traefik.http.services.genus_species-galaxy.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  jbrowse:
+    image: quay.io/galaxy-genome-annotation/jbrowse:v1.16.8
+    volumes:
+      - ./docker_data/galaxy/:/export/:ro
+      - ./src_data/:/project_data/:ro
+      #- /groups/XXX/:/groups/XXX/:ro # We do this when we have symlinks in src_data pointing to /groups/XXX/...
+      - ./docker_data/jbrowse/:/jbrowse/data/:ro
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        - "traefik.http.routers.genus_species-jbrowse.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species/jbrowse`))"
+        - "traefik.http.routers.genus_species-jbrowse.tls=true"
+        - "traefik.http.routers.genus_species-jbrowse.entryPoints=webs"
+        - "traefik.http.routers.genus_species-jbrowse.middlewares=sp-auth,sp-app-trailslash,sp-app-prefix"
+        - "traefik.http.services.genus_species-jbrowse.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  blast:
+    image: quay.io/abretaud/sf-blast:latest
+    depends_on:
+      - blast-db
+    environment:
+      DB_HOST: blast-db.genus_species
+      UPLOAD_LIMIT: 20M
+      MEMORY_LIMIT: 128M
+      DB_NAME: 'postgres'
+      ADMIN_EMAIL: 'xxx@example.org' # email sender
+      ADMIN_NAME: 'xxxxx' # email sender name
+      JOBS_METHOD: 'local' # Can be local (= no sge jobs, but run inside the container) or drmaa (= to submit to a cluster)
+      JOBS_WORK_DIR: '/xxxx/blast_jobs/' # disk accessible both from compute nodes and mounted in this docker (at the same path)
+      CDD_DELTA_PATH: '/db/cdd_delta/current/flat/cdd_delta'
+      BLAST_TITLE: 'Genus species blast server'
+      JOBS_SCHED_NAME: 'blast_gspecies' # job names
+      PRE_CMD: '. /local/env/envblast-2.6.0.sh; . /local/env/envpython-3.7.1.sh;' # executed at the beginning of each job
+      APACHE_RUN_USER: 'bipaaweb' # username known by sge
+      APACHE_RUN_GROUP: 'bipaa' # group known by sge
+      BASE_URL_PATH: '/sp/genus_species/blast/'
+      UID: 55914 # user id known by sge (for drmaa mode only)
+      GID: 40259 # group id known by sge (for drmaa mode only)
+      #JOBS_DRMAA_NATIVE: '-p web' # This line and following for slurm
+      #DRMAA_METHOD: 'slurm' # This line and following for slurm
+    volumes:
+      #- ../blast-themes/xxx/:/var/www/blast/app/Resources/:ro # You can theme the app
+      #- /data1/sge/:/usr/local/sge/:ro # an sge install
+      #- /xxxx/blast_jobs/:/xxxx/blast_jobs/ # (for drmaa mode only)
+      - ./blast/banks.yml:/var/www/blast/app/config/banks.yml:ro
+      - ./blast/links.yml:/etc/blast_links/links.yml:ro
+      #- /data1/slurm/slurm.conf:/etc/slurm-llnl/slurm.conf:ro # This line and following for slurm
+      #- /data1/slurm/gres.conf:/etc/slurm-llnl/gres.conf:ro
+      #- /data1/slurm/cgroup.conf:/etc/slurm-llnl/cgroup.conf:ro
+      #- /data1/slurm/slurmdbd.conf:/etc/slurm-llnl/slurmdbd.conf:ro
+      #- /data1/slurm/drmaa/:/etc/slurm-llnl/drmaa/:ro
+      #- /etc/munge/:/etc/munge/:ro
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        - "traefik.http.routers.genus_species-blast.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species/blast`))"
+        - "traefik.http.routers.genus_species-blast.tls=true"
+        - "traefik.http.routers.genus_species-blast.entryPoints=webs"
+        - "traefik.http.routers.genus_species-blast.middlewares=sp-big-req,sp-auth,sp-app-trailslash,sp-app-prefix"
+        - "traefik.http.services.genus_species-blast.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  blast-db:
+    image: postgres:9.6-alpine
+    environment:
+      - POSTGRES_PASSWORD=postgres
+      - PGDATA=/var/lib/postgresql/data/
+    volumes:
+      - ./docker_data/blast_db/:/var/lib/postgresql/data/
+    networks:
+      - genus_species
+
+  wiki:
+    image: quay.io/abretaud/mediawiki
+    environment:
+      MEDIAWIKI_SERVER: http://localhost
+      MEDIAWIKI_PROXY_PREFIX: /sp/genus_species/wiki
+      MEDIAWIKI_SITENAME: Genus species
+      MEDIAWIKI_SECRET_KEY: XXXXXXXXXX
+      MEDIAWIKI_DB_HOST: wiki-db.genus_species
+      MEDIAWIKI_DB_PASSWORD: password
+      MEDIAWIKI_ADMIN_USER: abretaud # ldap user
+    depends_on:
+      - wiki-db
+    volumes:
+      - ./docker_data/wiki_uploads:/images
+      #- ../bipaa_wiki.png:/var/www/mediawiki/resources/assets/wiki.png:ro # To change the logo at the top left
+    networks:
+      - traefik
+      - genus_species
+    deploy:
+      labels:
+        - "traefik.http.routers.genus_species-wiki.rule=(Host(`localhost`) && PathPrefix(`/sp/genus_species/wiki`))"
+        - "traefik.http.routers.genus_species-wiki.tls=true"
+        - "traefik.http.routers.genus_species-wiki.entryPoints=webs"
+        - "traefik.http.routers.genus_species-wiki.middlewares=sp-big-req,sp-auth,sp-app-trailslash,sp-app-prefix"
+        - "traefik.http.services.genus_species-wiki.loadbalancer.server.port=80"
+      restart_policy:
+        condition: on-failure
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+
+  wiki-db:
+    image: postgres:9.6-alpine
+    volumes:
+      - ./docker_data/wiki_db/:/var/lib/postgresql/data/
+    networks:
+      - genus_species
+
+networks:
+  traefik:
+    external: true
+  genus_species:
+    driver: overlay
+    name: genus_species
-- 
GitLab
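A side note on the `master_api_key: MASTERLOCK` setting in the galaxy service above: it is what lets bioblend-based scripts authenticate without a per-user API key, as mentioned in the README's nginx conf section. A minimal sketch, assuming the bioblend package is installed; the URL prefix and MASTERLOCK value are the placeholders used in this template:

```python
from bioblend.galaxy import GalaxyInstance

# Connect to the Galaxy API behind the proxy using the master API key
# configured in stack_template.yml (MASTERLOCK must be replaced in production).
gi = GalaxyInstance(url="http://localhost/sp/genus_species/galaxy/", key="MASTERLOCK")

# A typical master-key operation, e.g. listing the instance's users:
print(gi.users.get_users())
```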