diff --git a/.gitattributes b/.gitattributes index 3e0ca4ab..7f4d5667 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,3 +10,4 @@ mafft binary .gitattributes export-ignore .gitignore export-ignore .github export-ignore +*.mat filter=lfs diff=lfs merge=lfs -text diff --git a/doc/external/kegg/getKEGGModelForOrganism.html b/doc/external/kegg/getKEGGModelForOrganism.html index e5ec6a91..1aeb3f42 100644 --- a/doc/external/kegg/getKEGGModelForOrganism.html +++ b/doc/external/kegg/getKEGGModelForOrganism.html @@ -63,7 +63,7 @@

DESCRIPTION ^DESCRIPTION ^SOURCE CODE ^% The hidden Markov models as generated in 2b or 0039 % downloaded from BioMet Toolbox (see below) 0040 % The final directory in dataDir should be styled as -0041 % prok90_kegg105 or euk90_kegg105, indicating whether +0041 % prok90_kegg116 or euk90_kegg116, indicating whether 0042 % the HMMs were trained on pro- or eukaryotic 0043 % sequences; using which sequence similarity treshold 0044 % (first set of digits); using which KEGG version @@ -543,36 +543,36 @@

SOURCE CODE ^else 0260 outDir=char(outDir); 0261 end -0262 if nargin<5 +0262 if nargin<5 || isempty(keepSpontaneous) 0263 keepSpontaneous=true; 0264 end -0265 if nargin<6 +0265 if nargin<6 || isempty(keepUndefinedStoich) 0266 keepUndefinedStoich=true; 0267 end -0268 if nargin<7 +0268 if nargin<7 || isempty(keepIncomplete) 0269 keepIncomplete=true; 0270 end -0271 if nargin<8 +0271 if nargin<8 || isempty(keepGeneral) 0272 keepGeneral=false; 0273 end -0274 if nargin<9 +0274 if nargin<9 || isempty(cutOff) 0275 cutOff=10^-50; 0276 end -0277 if nargin<10 +0277 if nargin<10 || isempty(minScoreRatioKO) 0278 minScoreRatioKO=0.3; 0279 end -0280 if nargin<11 +0280 if nargin<11 || isempty(minScoreRatioG) 0281 minScoreRatioG=0.8; 0282 end -0283 if nargin<12 +0283 if nargin<12 || isempty(maxPhylDist) 0284 maxPhylDist=inf; 0285 %Include all sequences for each reaction 0286 end -0287 if nargin<13 +0287 if nargin<13 || isempty(nSequences) 0288 nSequences=inf; 0289 %Include all sequences for each reaction 0290 end -0291 if nargin<14 +0291 if nargin<14 || isempty(seqIdentity) 0292 seqIdentity=0.9; 0293 end 0294 @@ -599,9 +599,9 @@

SOURCE CODE ^%required zip file already in working directory or have it extracted. If 0316 %the zip file and directory is not here, it is downloaded from the cloud 0317 if ~isempty(dataDir) -0318 hmmOptions={'euk90_kegg105','prok90_kegg105'}; +0318 hmmOptions={'euk90_kegg116','prok90_kegg116'}; 0319 if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions. -0320 %If not, then check whether the required folders exist anyway. +0320 %If not, then check whether the required folders exist anyway. 0321 if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ... 0322 ~isfolder(fullfile(dataDir,'fasta')) && ... 0323 ~isfolder(fullfile(dataDir,'aligned')) && ... @@ -623,14 +623,14 @@

SOURCE CODE ^else 0340 fprintf('Downloading the HMMs archive file... '); 0341 try -0342 websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']); +0342 websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.11.0/',hmmOptions{hmmIndex},'.zip']); 0343 catch ME 0344 if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError') 0345 error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues') 0346 end 0347 end 0348 end -0349 +0349 0350 fprintf('COMPLETE\n'); 0351 fprintf('Extracting the HMMs archive file... '); 0352 unzip([dataDir,'.zip']); @@ -690,7 +690,7 @@

SOURCE CODE ^if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) 0407 error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); 0408 end -0409 +0409 0410 fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); 0411 if ismember(organismID,{'eukaryotes','prokaryotes'}) 0412 phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); @@ -836,496 +836,511 @@

SOURCE CODE ^return 0553 end 0554 -0555 %Check if alignment of FASTA files should be performed -0556 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); -0557 if ~isempty(missingAligned) -0558 if seqIdentity==-1 -0559 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); -0560 else -0561 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); -0562 end -0563 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); -0564 tmpFile=tempname; -0565 %On Windows, paths need to be translated to Unix before parsing it to WSL -0566 if ispc -0567 wslPath.tmpFile=getWSLpath(tmpFile); -0568 %mafft has problems writing to terminal (/dev/stderr) when running -0569 %on WSL via MATLAB, instead write and read progress file -0570 mafftOutput = tempname; -0571 wslPath.mafftOutput=getWSLpath(mafftOutput); -0572 wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); -0573 wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); -0574 end -0575 -0576 for i=1:numel(missingAligned) -0577 %This is checked here because it could be that it is created by a -0578 %parallel process. The faw-files are saved as temporary files to -0579 %kept track of which files are being worked on -0580 if ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])) &&... -0581 ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.fa'])) -0582 %Check that the multi-FASTA file exists. It should do so since -0583 %we are saving empty files as well. Print a warning and -0584 %continue if not -0585 if ~isfile(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])) -0586 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; -0587 dispEM(EM,false); -0588 continue; -0589 end -0590 -0591 %If the multi-FASTA file is empty then save an empty aligned -0592 %file and continue -0593 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0594 if s.bytes<=0 -0595 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); -0596 fclose(fid); -0597 continue; -0598 end -0599 -0600 %Create an empty file to prevent other threads to start to work -0601 %on the same alignment -0602 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); -0603 fclose(fid); -0604 -0605 %First load the FASTA file, then select up to nSequences -0606 %sequences of the most closely related species, apply any -0607 %constraints from maxPhylDist, and save it as a temporary file, -0608 %and create the model from that -0609 -0610 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0611 phylDist=inf(numel(fastaStruct),1); -0612 for j=1:numel(fastaStruct) -0613 %Get the organism abbreviation -0614 index=strfind(fastaStruct(j).Header,':'); -0615 if any(index) -0616 abbrev=fastaStruct(j).Header(1:index(1)-1); -0617 [~, index]=ismember(abbrev,phylDistStruct.ids); -0618 if any(index) -0619 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); -0620 end -0621 end -0622 end -0623 -0624 %Inf means that it should not be included -0625 phylDist(phylDist>maxPhylDist)=[]; -0626 -0627 %Sort based on phylDist -0628 [~, order]=sort(phylDist); -0629 -0630 %Save the first nSequences hits to a temporary FASTA file -0631 if nSequences<=numel(fastaStruct) -0632 fastaStruct=fastaStruct(order(1:nSequences)); -0633 else -0634 fastaStruct=fastaStruct(order); -0635 end -0636 -0637 %Do the clustering and alignment if there are more than one -0638 %sequences, otherwise just save the sequence (or an empty file) -0639 if numel(fastaStruct)>1 -0640 if seqIdentity~=-1 -0641 cdhitInpCustom=tempname; -0642 fastawrite(cdhitInpCustom,fastaStruct); -0643 if seqIdentity<=1 && seqIdentity>0.7 -0644 nparam='5'; -0645 elseif seqIdentity>0.6 -0646 nparam='4'; -0647 elseif seqIdentity>0.5 -0648 nparam='3'; -0649 elseif seqIdentity>0.4 -0650 nparam='2'; -0651 else -0652 EM='The provided seqIdentity must be between 0 and 1\n'; -0653 dispEM(EM); -0654 end -0655 if ispc -0656 wslPath.cdhitInpCustom=getWSLpath(cdhitInpCustom); -0657 [status, output]=system(['wsl "' wslPath.cdhit '" -T "' num2str(cores) '" -i "' wslPath.cdhitInpCustom '" -o "' wslPath.tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); -0658 elseif ismac || isunix -0659 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); -0660 end -0661 if status~=0 -0662 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0663 dispEM(EM); -0664 end -0665 %Remove the old tempfile -0666 if exist(cdhitInpCustom, 'file') -0667 delete([cdhitInpCustom '*']); -0668 end -0669 else -0670 %This means that CD-HIT should be skipped since -0671 %seqIdentity is equal to -1 -0672 fastawrite(tmpFile,fastaStruct); -0673 end -0674 %Do the alignment for this file -0675 if ismac -0676 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0677 elseif isunix -0678 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0679 elseif ispc -0680 wslPath.fawFile=getWSLpath(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])); -0681 [status, ~]=system(['wsl "' wslPath.mafft '" --auto --anysymbol --progress "' wslPath.mafftOutput '" --thread "' num2str(cores) '" --out "' wslPath.fawFile '" "' wslPath.tmpFile '"']); -0682 output=fileread(mafftOutput); -0683 delete(mafftOutput); -0684 end -0685 if status~=0 -0686 %It could be that alignment failed because only one -0687 %sequence was left after clustering. If that is the -0688 %case, then the clustered file is just copied as 'faw' -0689 %file -0690 if any(regexp(output,'Only 1 sequence found')) -0691 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); -0692 else -0693 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; -0694 dispEM(EM); -0695 end -0696 end -0697 %Remove the old tempfile -0698 if exist(tmpFile, 'file') -0699 delete([tmpFile '*']); -0700 end -0701 else -0702 %If there is only one sequence then it's not possible to do -0703 %a multiple alignment. Just print the sequence instead. An -0704 %empty file was written previously so that doesn't have to -0705 %be dealt with -0706 if numel(fastaStruct)==1 -0707 warnState = warning; %Save the current warning state -0708 warning('off','Bioinfo:fastawrite:AppendToFile'); -0709 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); -0710 warning(warnState) %Reset warning state to previous settings -0711 end -0712 end -0713 %Move the temporary file to the real one -0714 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); -0715 -0716 %Print the progress every 25 files -0717 if rem(i-1,25) == 0 -0718 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); -0719 progress=pad(progress,3,'left'); -0720 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0721 end -0722 end -0723 end -0724 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0725 else -0726 if seqIdentity==-1 -0727 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0728 else -0729 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0730 end -0731 end -0732 -0733 %Check if training of Hidden Markov models should be performed -0734 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); -0735 if ~isempty(missingHMMs) -0736 fprintf('Training the KEGG Orthology specific HMMs... 0%% complete'); -0737 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); -0738 %Train models for all missing KOs -0739 for i=1:numel(missingHMMs) -0740 %This is checked here because it could be that it is created by a -0741 %parallel process -0742 if ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])) && ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])) -0743 %Check that the aligned FASTA file exists. It could be that it -0744 %is still being worked on by some other instance of the program -0745 %(the .faw file should then exist). This should not happen on a -0746 %single computer. It doesn't throw an error, because it should -0747 %finalize the ones it can -0748 if ~isfile(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])) -0749 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; -0750 dispEM(EM,false); -0751 continue; -0752 end -0753 -0754 %If the multi-FASTA file is empty then save an empty aligned -0755 %file and continue -0756 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); -0757 if s.bytes<=0 -0758 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); -0759 fclose(fid); -0760 continue; -0761 end -0762 %Create a temporary file to indicate that it is working on the -0763 %KO. This is because hmmbuild cannot overwrite existing files -0764 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); -0765 fclose(fid); -0766 -0767 %Create HMM -0768 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); -0769 if status~=0 -0770 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; -0771 dispEM(EM); -0772 end -0773 -0774 %Delete the temporary file -0775 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); -0776 -0777 %Print the progress every 25 files -0778 if rem(i-1,25) == 0 -0779 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'hmms','*.hmm')))/numel(KOModel.rxns))); -0780 progress=pad(progress,3,'left'); -0781 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0782 end -0783 end -0784 end -0785 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0786 else -0787 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); -0788 end -0789 -0790 %Check which new .out files that should be generated. Check if training of -0791 %Hidden Markov models should be performed -0792 missingOUT=setdiff(KOModel.rxns,outFiles); -0793 if ~isempty(missingOUT) -0794 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... 0%% complete'); -0795 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); -0796 for i=1:numel(missingOUT) -0797 %This is checked here because it could be that it is created by a -0798 %parallel process -0799 if ~isfile(fullfile(outDir,[missingOUT{i} '.out'])) -0800 %Check that the HMM file exists. It should do so since %we are -0801 %saving empty files as well. Print a warning and continue if -0802 %not -0803 if ~isfile(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])) -0804 EM=['The HMM file for ' missingOUT{i} ' does not exist']; -0805 dispEM(EM,false); -0806 continue; -0807 end -0808 -0809 %Save an empty file to prevent several threads working on the -0810 %same file -0811 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0812 fclose(fid); -0813 -0814 %If the HMM file is empty then save an out file and continue -0815 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); -0816 if s.bytes<=0 -0817 continue; -0818 end -0819 -0820 %Check each gene in the input file against this model -0821 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); -0822 if status~=0 -0823 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; -0824 dispEM(EM); -0825 end -0826 -0827 %Save the output to a file -0828 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0829 fwrite(fid,output); -0830 fclose(fid); -0831 -0832 %Print the progress every 25 files -0833 if rem(i-1,25) == 0 -0834 progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); -0835 progress=pad(progress,3,'left'); -0836 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0837 end -0838 end -0839 end -0840 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0841 else -0842 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... COMPLETE\n'); -0843 end -0844 -0845 -0846 %***Begin retrieving the output and putting together the resulting model -0847 -0848 fprintf('Parsing the HMM search results... '); -0849 %Retrieve matched genes from the HMMs -0850 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes -0851 genes=cell(3000,1); -0852 %Store the best score for a gene in a hash list (since it will be searching -0853 %many times) -0854 hTable = java.util.Hashtable; -0855 -0856 geneCounter=0; -0857 for i=1:numel(KOModel.rxns) -0858 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') -0859 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); -0860 beginMatches=false; -0861 while 1 -0862 %Get the next line -0863 tline = fgetl(fid); -0864 -0865 %Abort at end of file -0866 if ~ischar(tline) -0867 break; -0868 end -0869 -0870 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) -0871 break; -0872 end -0873 -0874 if beginMatches==false -0875 %This is how the listing of matches begins -0876 if any(strfind(tline,'E-value ')) -0877 %Read one more line that is only padding -0878 tline = fgetl(fid); -0879 beginMatches=true; -0880 end -0881 else -0882 %If matches should be read -0883 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) -0884 elements=regexp(tline,' ','split'); -0885 elements=elements(cellfun(@any,elements)); -0886 -0887 %Check if the match is below the treshhold -0888 score=str2double(elements{1}); -0889 gene=elements{9}; -0890 if score<=cutOff -0891 %If the score is exactly 0, change it to a very -0892 %small value to avoid NaN -0893 if score==0 -0894 score=10^-250; -0895 end -0896 %Check if the gene is added already and, is so, get -0897 %the best score for it -0898 I=hTable.get(gene); -0899 if any(I) -0900 koGeneMat(i,I)=score; -0901 else -0902 geneCounter=geneCounter+1; -0903 %The gene was not present yet so add it -0904 hTable.put(gene,geneCounter); -0905 genes{geneCounter}=gene; -0906 koGeneMat(i,geneCounter)=score; -0907 end -0908 end -0909 else -0910 break; -0911 end -0912 end -0913 end -0914 fclose(fid); -0915 end -0916 end -0917 fprintf('COMPLETE\n'); -0918 -0919 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); -0920 koGeneMat=koGeneMat(:,1:geneCounter); -0921 -0922 %Remove the genes for each KO that are below minScoreRatioKO. -0923 for i=1:size(koGeneMat,1) -0924 J=find(koGeneMat(i,:)); -0925 if any(J) -0926 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; -0927 end -0928 end -0929 -0930 %Remove the KOs for each gene that are below minScoreRatioG -0931 for i=1:size(koGeneMat,2) -0932 J=find(koGeneMat(:,i)); -0933 if any(J) -0934 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; -0935 end -0936 end -0937 fprintf('COMPLETE\n'); -0938 -0939 fprintf('Adding gene annotations to the model... '); -0940 %Create the new model -0941 model.genes=genes(1:geneCounter); -0942 model.grRules=cell(numel(model.rxns),1); -0943 model.grRules(:)={''}; -0944 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); -0945 -0946 %Loop through the reactions and add the corresponding genes -0947 for i=1:numel(model.rxns) -0948 if isstruct(model.rxnMiriams{i}) -0949 %Get all KOs -0950 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); -0951 KOs=model.rxnMiriams{i}.value(I); -0952 %Find the KOs and the corresponding genes -0953 J=ismember(KOModel.rxns,KOs); -0954 [~, K]=find(koGeneMat(J,:)); -0955 -0956 if any(K) -0957 model.rxnGeneMat(i,K)=1; -0958 %Also delete KOs for which no genes were found. If no genes at -0959 %all were matched to the reaction it will be deleted later -0960 L=sum(koGeneMat(J,:),2)==0; -0961 model.rxnMiriams{i}.value(I(L))=[]; -0962 model.rxnMiriams{i}.name(I(L))=[]; -0963 end -0964 end -0965 end -0966 fprintf('COMPLETE\n'); -0967 -0968 %Find and delete all reactions without genes. This also removes genes that -0969 %are not used (which could happen because minScoreRatioG and -0970 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions -0971 %without genes are kept in the model. Spontaneous reactions with original -0972 %gene associations are treated in the same way, like the rest of the -0973 %reactions - if gene associations were removed during HMM search, such -0974 %reactions are deleted from the model -0975 if keepSpontaneous==true -0976 %Not the most comprise way to delete reactions without genes, but this -0977 %makes the code easier to understand. Firstly the non-spontaneous -0978 %reactions without genes are removed. After that, the second deletion -0979 %step removes spontaneous reactions, which had gene associations before -0980 %HMM search, but no longer have after it -0981 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); -0982 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); -0983 model=removeReactions(model,I,true,true); -0984 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); -0985 model=removeReactions(model,I,true,true); -0986 else -0987 %Just simply check for any new reactions without genes and remove -0988 %it -0989 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); -0990 I=~any(model.rxnGeneMat,2); -0991 model=removeReactions(model,I,true,true); -0992 end -0993 fprintf('COMPLETE\n'); -0994 -0995 fprintf('Constructing GPR rules and finalizing the model... '); -0996 %Add the gene associations as 'or' -0997 for i=1:numel(model.rxns) -0998 %Find the involved genes -0999 I=find(model.rxnGeneMat(i,:)); -1000 if any(I) -1001 model.grRules{i}=['(' model.genes{I(1)}]; -1002 for j=2:numel(I) -1003 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; -1004 end -1005 model.grRules{i}=[model.grRules{i} ')']; -1006 end +0555 tmpFile=tempname; +0556 %On Windows, paths need to be translated to Unix before parsing it to WSL +0557 if ispc +0558 wslPath.tmpFile=getWSLpath(tmpFile); +0559 %mafft has problems writing to terminal (/dev/stderr) when running +0560 %on WSL via MATLAB, instead write and read progress file +0561 mafftOutput = tempname; +0562 wslPath.mafftOutput=getWSLpath(mafftOutput); +0563 wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); +0564 wslPath.hmmbuild=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild')); +0565 wslPath.hmmsearch=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch')); +0566 wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); +0567 end +0568 +0569 %Check if alignment of FASTA files should be performed +0570 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); +0571 if ~isempty(missingAligned) +0572 if seqIdentity==-1 +0573 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); +0574 else +0575 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); +0576 end +0577 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); +0578 +0579 for i=1:numel(missingAligned) +0580 %This is checked here because it could be that it is created by a +0581 %parallel process. The faw-files are saved as temporary files to +0582 %kept track of which files are being worked on +0583 if ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])) &&... +0584 ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.fa'])) +0585 %Check that the multi-FASTA file exists. It should do so since +0586 %we are saving empty files as well. Print a warning and +0587 %continue if not +0588 if ~isfile(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])) +0589 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; +0590 dispEM(EM,false); +0591 continue; +0592 end +0593 +0594 %If the multi-FASTA file is empty then save an empty aligned +0595 %file and continue +0596 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0597 if s.bytes<=0 +0598 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); +0599 fclose(fid); +0600 continue; +0601 end +0602 +0603 %Create an empty file to prevent other threads to start to work +0604 %on the same alignment +0605 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); +0606 fclose(fid); +0607 +0608 %First load the FASTA file, then select up to nSequences +0609 %sequences of the most closely related species, apply any +0610 %constraints from maxPhylDist, and save it as a temporary file, +0611 %and create the model from that +0612 +0613 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0614 phylDist=inf(numel(fastaStruct),1); +0615 for j=1:numel(fastaStruct) +0616 %Get the organism abbreviation +0617 index=strfind(fastaStruct(j).Header,':'); +0618 if any(index) +0619 abbrev=fastaStruct(j).Header(1:index(1)-1); +0620 [~, index]=ismember(abbrev,phylDistStruct.ids); +0621 if any(index) +0622 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); +0623 end +0624 end +0625 end +0626 +0627 %Inf means that it should not be included +0628 phylDist(phylDist>maxPhylDist)=[]; +0629 +0630 %Sort based on phylDist +0631 [~, order]=sort(phylDist); +0632 +0633 %Save the first nSequences hits to a temporary FASTA file +0634 if nSequences<=numel(fastaStruct) +0635 fastaStruct=fastaStruct(order(1:nSequences)); +0636 else +0637 fastaStruct=fastaStruct(order); +0638 end +0639 +0640 %Do the clustering and alignment if there are more than one +0641 %sequences, otherwise just save the sequence (or an empty file) +0642 if numel(fastaStruct)>1 +0643 if seqIdentity~=-1 +0644 cdhitInpCustom=tempname; +0645 fastawrite(cdhitInpCustom,fastaStruct); +0646 if seqIdentity<=1 && seqIdentity>0.7 +0647 nparam='5'; +0648 elseif seqIdentity>0.6 +0649 nparam='4'; +0650 elseif seqIdentity>0.5 +0651 nparam='3'; +0652 elseif seqIdentity>0.4 +0653 nparam='2'; +0654 else +0655 EM='The provided seqIdentity must be between 0 and 1\n'; +0656 dispEM(EM); +0657 end +0658 if ispc +0659 wslPath.cdhitInpCustom=getWSLpath(cdhitInpCustom); +0660 [status, output]=system(['wsl "' wslPath.cdhit '" -T "' num2str(cores) '" -i "' wslPath.cdhitInpCustom '" -o "' wslPath.tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); +0661 elseif ismac || isunix +0662 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); +0663 end +0664 if status~=0 +0665 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0666 dispEM(EM); +0667 end +0668 %Remove the old tempfile +0669 if exist(cdhitInpCustom, 'file') +0670 delete([cdhitInpCustom '*']); +0671 end +0672 else +0673 %This means that CD-HIT should be skipped since +0674 %seqIdentity is equal to -1 +0675 fastawrite(tmpFile,fastaStruct); +0676 end +0677 %Do the alignment for this file +0678 if ismac +0679 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0680 elseif isunix +0681 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0682 elseif ispc +0683 wslPath.fawFile=getWSLpath(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])); +0684 [status, ~]=system(['wsl "' wslPath.mafft '" --auto --anysymbol --progress "' wslPath.mafftOutput '" --thread "' num2str(cores) '" --out "' wslPath.fawFile '" "' wslPath.tmpFile '"']); +0685 output=fileread(mafftOutput); +0686 delete(mafftOutput); +0687 end +0688 if status~=0 +0689 %It could be that alignment failed because only one +0690 %sequence was left after clustering. If that is the +0691 %case, then the clustered file is just copied as 'faw' +0692 %file +0693 if any(regexp(output,'Only 1 sequence found')) +0694 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); +0695 else +0696 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; +0697 dispEM(EM); +0698 end +0699 end +0700 %Remove the old tempfile +0701 if exist(tmpFile, 'file') +0702 delete([tmpFile '*']); +0703 end +0704 else +0705 %If there is only one sequence then it's not possible to do +0706 %a multiple alignment. Just print the sequence instead. An +0707 %empty file was written previously so that doesn't have to +0708 %be dealt with +0709 if numel(fastaStruct)==1 +0710 warnState = warning; %Save the current warning state +0711 warning('off','Bioinfo:fastawrite:AppendToFile'); +0712 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); +0713 warning(warnState) %Reset warning state to previous settings +0714 end +0715 end +0716 %Move the temporary file to the real one +0717 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); +0718 +0719 %Print the progress every 25 files +0720 if rem(i-1,25) == 0 +0721 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); +0722 progress=pad(progress,3,'left'); +0723 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0724 end +0725 end +0726 end +0727 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0728 else +0729 if seqIdentity==-1 +0730 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0731 else +0732 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0733 end +0734 end +0735 +0736 %Check if training of Hidden Markov models should be performed +0737 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); +0738 if ~isempty(missingHMMs) +0739 fprintf('Training the KEGG Orthology specific HMMs... 0%% complete'); +0740 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); +0741 %Train models for all missing KOs +0742 for i=1:numel(missingHMMs) +0743 %This is checked here because it could be that it is created by a +0744 %parallel process +0745 if ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])) && ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])) +0746 %Check that the aligned FASTA file exists. It could be that it +0747 %is still being worked on by some other instance of the program +0748 %(the .faw file should then exist). This should not happen on a +0749 %single computer. It doesn't throw an error, because it should +0750 %finalize the ones it can +0751 if ~isfile(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])) +0752 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; +0753 dispEM(EM,false); +0754 continue; +0755 end +0756 +0757 %If the multi-FASTA file is empty then save an empty aligned +0758 %file and continue +0759 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); +0760 if s.bytes<=0 +0761 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); +0762 fclose(fid); +0763 continue; +0764 end +0765 %Create a temporary file to indicate that it is working on the +0766 %KO. This is because hmmbuild cannot overwrite existing files +0767 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); +0768 fclose(fid); +0769 +0770 %Create HMM +0771 if ismac || isunix +0772 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); +0773 else +0774 wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])); +0775 wslPath.alignFile = getWSLpath(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); +0776 [status, output] = system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.alignFile '"']); +0777 end +0778 if status~=0 +0779 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; +0780 dispEM(EM); +0781 end +0782 +0783 %Delete the temporary file +0784 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); +0785 +0786 %Print the progress every 25 files +0787 if rem(i-1,25) == 0 +0788 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'hmms','*.hmm')))/numel(KOModel.rxns))); +0789 progress=pad(progress,3,'left'); +0790 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0791 end +0792 end +0793 end +0794 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0795 else +0796 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); +0797 end +0798 +0799 %Check which new .out files that should be generated. Check if training of +0800 %Hidden Markov models should be performed +0801 missingOUT=setdiff(KOModel.rxns,outFiles); +0802 if ~isempty(missingOUT) +0803 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... 0%% complete'); +0804 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); +0805 for i=1:numel(missingOUT) +0806 %This is checked here because it could be that it is created by a +0807 %parallel process +0808 if ~isfile(fullfile(outDir,[missingOUT{i} '.out'])) +0809 %Check that the HMM file exists. It should do so since %we are +0810 %saving empty files as well. Print a warning and continue if +0811 %not +0812 if ~isfile(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])) +0813 EM=['The HMM file for ' missingOUT{i} ' does not exist']; +0814 dispEM(EM,false); +0815 continue; +0816 end +0817 +0818 %Save an empty file to prevent several threads working on the +0819 %same file +0820 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0821 fclose(fid); +0822 +0823 %If the HMM file is empty then save an out file and continue +0824 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); +0825 if s.bytes<=0 +0826 continue; +0827 end +0828 +0829 %Check each gene in the input file against this model +0830 if ismac || isunix +0831 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); +0832 else +0833 wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); +0834 wslPath.fastaFile = getWSLpath(fastaFile); +0835 [status, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.fastaFile '"']); +0836 end +0837 if status~=0 +0838 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; +0839 dispEM(EM); +0840 end +0841 +0842 %Save the output to a file +0843 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0844 fwrite(fid,output); +0845 fclose(fid); +0846 +0847 %Print the progress every 25 files +0848 if rem(i-1,25) == 0 +0849 progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); +0850 progress=pad(progress,3,'left'); +0851 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0852 end +0853 end +0854 end +0855 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0856 else +0857 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... COMPLETE\n'); +0858 end +0859 +0860 +0861 %***Begin retrieving the output and putting together the resulting model +0862 +0863 fprintf('Parsing the HMM search results... '); +0864 %Retrieve matched genes from the HMMs +0865 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes +0866 genes=cell(3000,1); +0867 %Store the best score for a gene in a hash list (since it will be searching +0868 %many times) +0869 hTable = java.util.Hashtable; +0870 +0871 geneCounter=0; +0872 for i=1:numel(KOModel.rxns) +0873 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') +0874 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); +0875 beginMatches=false; +0876 while 1 +0877 %Get the next line +0878 tline = fgetl(fid); +0879 +0880 %Abort at end of file +0881 if ~ischar(tline) +0882 break; +0883 end +0884 +0885 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) +0886 break; +0887 end +0888 +0889 if beginMatches==false +0890 %This is how the listing of matches begins +0891 if any(strfind(tline,'E-value ')) +0892 %Read one more line that is only padding +0893 tline = fgetl(fid); +0894 beginMatches=true; +0895 end +0896 else +0897 %If matches should be read +0898 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) +0899 elements=regexp(tline,' ','split'); +0900 elements=elements(cellfun(@any,elements)); +0901 +0902 %Check if the match is below the treshhold +0903 score=str2double(elements{1}); +0904 gene=elements{9}; +0905 if score<=cutOff +0906 %If the score is exactly 0, change it to a very +0907 %small value to avoid NaN +0908 if score==0 +0909 score=10^-250; +0910 end +0911 %Check if the gene is added already and, is so, get +0912 %the best score for it +0913 I=hTable.get(gene); +0914 if any(I) +0915 koGeneMat(i,I)=score; +0916 else +0917 geneCounter=geneCounter+1; +0918 %The gene was not present yet so add it +0919 hTable.put(gene,geneCounter); +0920 genes{geneCounter}=gene; +0921 koGeneMat(i,geneCounter)=score; +0922 end +0923 end +0924 else +0925 break; +0926 end +0927 end +0928 end +0929 fclose(fid); +0930 end +0931 end +0932 fprintf('COMPLETE\n'); +0933 +0934 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); +0935 koGeneMat=koGeneMat(:,1:geneCounter); +0936 +0937 %Remove the genes for each KO that are below minScoreRatioKO. +0938 for i=1:size(koGeneMat,1) +0939 J=find(koGeneMat(i,:)); +0940 if any(J) +0941 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; +0942 end +0943 end +0944 +0945 %Remove the KOs for each gene that are below minScoreRatioG +0946 for i=1:size(koGeneMat,2) +0947 J=find(koGeneMat(:,i)); +0948 if any(J) +0949 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; +0950 end +0951 end +0952 fprintf('COMPLETE\n'); +0953 +0954 fprintf('Adding gene annotations to the model... '); +0955 %Create the new model +0956 model.genes=genes(1:geneCounter); +0957 model.grRules=cell(numel(model.rxns),1); +0958 model.grRules(:)={''}; +0959 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); +0960 +0961 %Loop through the reactions and add the corresponding genes +0962 for i=1:numel(model.rxns) +0963 if isstruct(model.rxnMiriams{i}) +0964 %Get all KOs +0965 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); +0966 KOs=model.rxnMiriams{i}.value(I); +0967 %Find the KOs and the corresponding genes +0968 J=ismember(KOModel.rxns,KOs); +0969 [~, K]=find(koGeneMat(J,:)); +0970 +0971 if any(K) +0972 model.rxnGeneMat(i,K)=1; +0973 %Also delete KOs for which no genes were found. If no genes at +0974 %all were matched to the reaction it will be deleted later +0975 L=sum(koGeneMat(J,:),2)==0; +0976 model.rxnMiriams{i}.value(I(L))=[]; +0977 model.rxnMiriams{i}.name(I(L))=[]; +0978 end +0979 end +0980 end +0981 fprintf('COMPLETE\n'); +0982 +0983 %Find and delete all reactions without genes. This also removes genes that +0984 %are not used (which could happen because minScoreRatioG and +0985 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions +0986 %without genes are kept in the model. Spontaneous reactions with original +0987 %gene associations are treated in the same way, like the rest of the +0988 %reactions - if gene associations were removed during HMM search, such +0989 %reactions are deleted from the model +0990 if keepSpontaneous==true +0991 %Not the most comprise way to delete reactions without genes, but this +0992 %makes the code easier to understand. Firstly the non-spontaneous +0993 %reactions without genes are removed. After that, the second deletion +0994 %step removes spontaneous reactions, which had gene associations before +0995 %HMM search, but no longer have after it +0996 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); +0997 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); +0998 model=removeReactions(model,I,true,true); +0999 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); +1000 model=removeReactions(model,I,true,true); +1001 else +1002 %Just simply check for any new reactions without genes and remove +1003 %it +1004 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); +1005 I=~any(model.rxnGeneMat,2); +1006 model=removeReactions(model,I,true,true); 1007 end -1008 -1009 %Fix grRules and reconstruct rxnGeneMat -1010 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output -1011 model.grRules = grRules; -1012 model.rxnGeneMat = rxnGeneMat; -1013 -1014 %Fix subsystems -1015 emptySubSystems=cellfun(@isempty, model.subSystems); -1016 model.subSystems(emptySubSystems)={{''}}; -1017 -1018 %Add the description to the reactions -1019 for i=1:numel(model.rxns) -1020 if ~isempty(model.rxnNotes{i}) -1021 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); -1022 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); -1023 else -1024 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; -1025 end -1026 end -1027 %Remove the temp fasta file -1028 delete(fastaFile) -1029 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); -1030 end -1031 -1032 function files=listFiles(directory) -1033 %Supporter function to list the files in a directory and return them as a -1034 %cell array -1035 temp=dir(directory); -1036 files=cell(numel(temp),1); -1037 for i=1:numel(temp) -1038 files{i}=temp(i,1).name; -1039 end -1040 files=strrep(files,'.fa',''); -1041 files=strrep(files,'.hmm',''); -1042 files=strrep(files,'.out',''); -1043 files=strrep(files,'.faw',''); -1044 end +1008 fprintf('COMPLETE\n'); +1009 +1010 fprintf('Constructing GPR rules and finalizing the model... '); +1011 %Add the gene associations as 'or' +1012 for i=1:numel(model.rxns) +1013 %Find the involved genes +1014 I=find(model.rxnGeneMat(i,:)); +1015 if any(I) +1016 model.grRules{i}=['(' model.genes{I(1)}]; +1017 for j=2:numel(I) +1018 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; +1019 end +1020 model.grRules{i}=[model.grRules{i} ')']; +1021 end +1022 end +1023 +1024 %Fix grRules and reconstruct rxnGeneMat +1025 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output +1026 model.grRules = grRules; +1027 model.rxnGeneMat = rxnGeneMat; +1028 +1029 %Fix subsystems +1030 emptySubSystems=cellfun(@isempty, model.subSystems); +1031 model.subSystems(emptySubSystems)={{''}}; +1032 +1033 %Add the description to the reactions +1034 for i=1:numel(model.rxns) +1035 if ~isempty(model.rxnNotes{i}) +1036 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); +1037 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); +1038 else +1039 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; +1040 end +1041 end +1042 %Remove the temp fasta file +1043 delete(fastaFile) +1044 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); +1045 end +1046 +1047 function files=listFiles(directory) +1048 %Supporter function to list the files in a directory and return them as a +1049 %cell array +1050 temp=dir(directory); +1051 files=cell(numel(temp),1); +1052 for i=1:numel(temp) +1053 files{i}=temp(i,1).name; +1054 end +1055 files=strrep(files,'.fa',''); +1056 files=strrep(files,'.hmm',''); +1057 files=strrep(files,'.out',''); +1058 files=strrep(files,'.faw',''); +1059 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/external/kegg/getModelFromKEGG.html b/doc/external/kegg/getModelFromKEGG.html index 4649eab5..ae4dd1a3 100644 --- a/doc/external/kegg/getModelFromKEGG.html +++ b/doc/external/kegg/getModelFromKEGG.html @@ -132,249 +132,250 @@

SOURCE CODE ^% Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,... 0045 % keepUndefinedStoich,keepIncomplete,keepGeneral) 0046 -0047 if nargin<1 -0048 keggPath='RAVEN/external/kegg'; -0049 else -0050 keggPath=char(keggPath); -0051 end -0052 if nargin<2 -0053 keepSpontaneous=true; -0054 end -0055 if nargin<3 -0056 keepUndefinedStoich=true; -0057 end -0058 if nargin<4 -0059 keepIncomplete=true; -0060 end -0061 if nargin<5 -0062 keepGeneral=false; -0063 end -0064 -0065 ravenPath=findRAVENroot(); -0066 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat'); -0067 if exist(modelFile, 'file') && isNewestFile(ravenPath) -0068 fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']); -0069 load(modelFile); -0070 fprintf('COMPLETE\n'); -0071 else -0072 fprintf(['NOTE: The file ' strrep(modelFile,'\','/') ' does not exist or is out-of-date and therefore will be (re)generated\n']); -0073 %First get all reactions -0074 [model,isSpontaneous,isUndefinedStoich,isIncomplete,isGeneral]=getRxnsFromKEGG(keggPath); -0075 -0076 %Get the KO ids that are associated with any of the reactions. They -0077 %will be used later on to create a rxn-gene matrix -0078 KOs=cell(numel(model.rxns)*2,1); -0079 %Make room for two KO ids per reaction -0080 -0081 addToIndex=1; -0082 %Add all KO ids that are used -0083 for i=1:numel(model.rxns) -0084 if isstruct(model.rxnMiriams{i}) -0085 for j=1:numel(model.rxnMiriams{i}.name) -0086 if strcmpi('kegg.orthology',model.rxnMiriams{i}.name{j}) -0087 %Add the KO id -0088 KOs(addToIndex)=model.rxnMiriams{i}.value(j); -0089 addToIndex=addToIndex+1; -0090 end -0091 end -0092 end -0093 end -0094 -0095 KOs=KOs(1:addToIndex-1); -0096 KOs=unique(KOs); -0097 -0098 %Get all genes from any organism in KEGG that is associated with any of -0099 %the KOs -0100 KOModel=getGenesFromKEGG(keggPath,KOs); -0101 -0102 fprintf('Pruning the global KEGG model from the partially annotated, lumped KEGG Orthology entries... ') -0103 model.genes=KOModel.genes; -0104 -0105 %It can be that there are KOs from the reactions that have no database -0106 %entry. These are (as far as I have seen) lumped versions of other KOs -0107 %and should be removed -0108 KOsToRemove=setdiff(KOs, KOModel.rxns); -0109 -0110 %Loop through all reactions and delete the KOs that were not found -0111 for i=1:numel(model.rxns) -0112 if isstruct(model.rxnMiriams{i}) -0113 for j=1:numel(model.rxnMiriams{i}.name) -0114 toDel=[]; -0115 if strcmp(model.rxnMiriams{i}.name{j},'kegg.orthology') -0116 if ismember(model.rxnMiriams{i}.value{j},KOsToRemove) -0117 toDel=[toDel;j]; -0118 end -0119 end -0120 end -0121 %Delete the KOs -0122 if any(toDel) -0123 %If all posts are deleted, then delete the whole structure -0124 if numel(toDel)==j -0125 model.rxnMiriams{i}=[]; -0126 else -0127 model.rxnMiriams{i}.name(toDel)=[]; -0128 model.rxnMiriams{i}.value(toDel)=[]; -0129 end -0130 end -0131 end -0132 end -0133 fprintf('COMPLETE\n'); -0134 -0135 fprintf('Constructing the rxnGeneMat for the global KEGG model... 0%% complete'); -0136 %Create the rxnGeneMat for the reactions. This is simply done by -0137 %merging the gene associations for all the involved KOs -0138 r=zeros(10000000,1); -0139 %Store the positions since it's slow to write to a sparse array in a -0140 %loop -0141 c=zeros(10000000,1); -0142 counter=1; -0143 for i=1:numel(model.rxns) -0144 if isstruct(model.rxnMiriams{i}) -0145 I=strncmp('kegg.orthology',model.rxnMiriams{i}.name,18); -0146 if any(I) -0147 [J, K]=ismember(model.rxnMiriams{i}.value(I),KOModel.rxns); -0148 %Find all gene indexes that correspond to any of these KOs -0149 [~, L]=find(KOModel.rxnGeneMat(K(J),:)); -0150 if any(L) -0151 %Allocate room for more elements if needed -0152 if counter+numel(L)-1>=numel(r) -0153 r=[r;zeros(numel(r),1)]; -0154 c=[c;zeros(numel(c),1)]; -0155 end -0156 r(counter:counter+numel(L)-1)=ones(numel(L),1)*i; -0157 c(counter:counter+numel(L)-1)=L(:); -0158 counter=counter+numel(L); -0159 end -0160 end -0161 end -0162 if rem(i-1,100) == 0 -0163 progress=pad(num2str(floor(i/numel(model.rxns)*100)),3,'left'); -0164 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0165 end -0166 end -0167 -0168 model.rxnGeneMat=sparse(r(1:counter-1),c(1:counter-1),ones(counter-1,1)); -0169 if size(model.rxnGeneMat,1)~=numel(model.rxns) || size(model.rxnGeneMat,2)~=numel(KOModel.genes) -0170 model.rxnGeneMat(numel(model.rxns),numel(KOModel.genes))=0; -0171 end -0172 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0173 -0174 %Then get all metabolites -0175 metModel=getMetsFromKEGG(keggPath); -0176 -0177 fprintf('Finalizing the global KEGG model... '); -0178 %Add information about all metabolites to the model -0179 [a, b]=ismember(model.mets,metModel.mets); -0180 a=find(a); -0181 b=b(a); -0182 -0183 if ~isfield(model,'metNames') -0184 model.metNames=cell(numel(model.mets),1); -0185 model.metNames(:)={''}; -0186 end -0187 model.metNames(a)=metModel.metNames(b); -0188 -0189 if ~isfield(model,'metFormulas') -0190 model.metFormulas=cell(numel(model.mets),1); -0191 model.metFormulas(:)={''}; -0192 end -0193 model.metFormulas(a)=metModel.metFormulas(b); -0194 -0195 if ~isfield(model,'inchis') -0196 model.inchis=cell(numel(model.mets),1); -0197 model.inchis(:)={''}; -0198 end -0199 model.inchis(a)=metModel.inchis(b); -0200 -0201 if ~isfield(model,'metMiriams') -0202 model.metMiriams=cell(numel(model.mets),1); -0203 end -0204 model.metMiriams(a)=metModel.metMiriams(b); -0205 -0206 %The composition should be loaded from InChIs when available -0207 I=find(~cellfun(@isempty,model.inchis)); -0208 for i=1:numel(I) -0209 S=regexp(model.inchis(I(i)),'/','split'); -0210 S=S{1}; -0211 if numel(S)>=2 -0212 %Don't copy if it doesn't look good -0213 model.metFormulas(I(i))=S(2); -0214 end -0215 end -0216 -0217 %Put all metabolites in one compartment called 's' (for system). This -0218 %is done just to be more compatible with the rest of the code -0219 model.comps={'s'}; -0220 model.compNames={'System'}; -0221 model.metComps=ones(numel(model.mets),1); -0222 -0223 %If reactions with undefined stoichiometry are kept, then the -0224 %corresponding metabolites will have ids such as "(n+1) C00001" and -0225 %their names will be empty. These ids are not valid SBML identifiers -0226 %and are therefore replaced with "undefined1, undefined2...". The -0227 %former ids are stored as the new names -0228 I=find(cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m'))); -0229 model.metNames(I)=model.mets(I); -0230 repNums=1:numel(I); -0231 repIDs=strcat('undefined_',cellfun(@num2str,num2cell(repNums(:)),'UniformOutput',false)); -0232 model.mets(I)=repIDs; -0233 -0234 %It could also be that the metabolite names are empty for some other -0235 %reason. In that case, use the ID instead -0236 I=cellfun(@isempty,model.metNames); -0237 model.metNames(I)=model.mets(I); -0238 -0239 %Deafult LB and UB -0240 model.annotation.defaultLB = -1000; -0241 model.annotation.defaultUB = 1000; -0242 -0243 %Save the model structure -0244 save(modelFile,'model','KOModel','isGeneral','isIncomplete','isUndefinedStoich','isSpontaneous'); -0245 fprintf('COMPLETE\n'); -0246 end -0247 -0248 %Delete reactions which are labeled as "incomplete", "erroneous", -0249 %"unclear", "general reaction" or having undefined stoichiometry (depending -0250 %on settings) -0251 if keepSpontaneous==false -0252 model=removeReactions(model,intersect(isSpontaneous,model.rxns),true,true); -0253 end -0254 if keepUndefinedStoich==false -0255 model=removeReactions(model,intersect(isUndefinedStoich,model.rxns),true,true); -0256 end -0257 if keepIncomplete==false -0258 model=removeReactions(model,intersect(isIncomplete,model.rxns),true,true); -0259 end -0260 if keepGeneral==false -0261 model=removeReactions(model,intersect(isGeneral,model.rxns),true,true); -0262 end -0263 -0264 end -0265 -0266 function output = isNewestFile(ravenPath) -0267 %The ad hoc function, which checks whether keggModel.mat is the more -0268 %recently modified than keggRxns.mat, keggGenes.mat and keggRxns.mat -0269 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat'); -0270 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); -0271 genesFile=fullfile(ravenPath,'external','kegg','keggGenes.mat'); -0272 metsFile=fullfile(ravenPath,'external','kegg','keggMets.mat'); -0273 if (getFileTime(modelFile)>getFileTime(rxnsFile))&&... -0274 (getFileTime(modelFile)>getFileTime(genesFile))&&... -0275 (getFileTime(modelFile)>getFileTime(metsFile)) -0276 output=1; -0277 else -0278 output=0; -0279 end +0047 ravenPath=findRAVENroot(); +0048 +0049 if nargin<1 +0050 keggPath=fullfile(ravenPath,'external','kegg'); +0051 else +0052 keggPath=char(keggPath); +0053 end +0054 if nargin<2 +0055 keepSpontaneous=true; +0056 end +0057 if nargin<3 +0058 keepUndefinedStoich=true; +0059 end +0060 if nargin<4 +0061 keepIncomplete=true; +0062 end +0063 if nargin<5 +0064 keepGeneral=false; +0065 end +0066 +0067 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat'); +0068 if exist(modelFile, 'file') && isNewestFile(ravenPath) +0069 fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']); +0070 load(modelFile); +0071 fprintf('COMPLETE\n'); +0072 else +0073 fprintf(['NOTE: The file ' strrep(modelFile,'\','/') ' does not exist or is out-of-date and therefore will be (re)generated\n']); +0074 %First get all reactions +0075 [model,isSpontaneous,isUndefinedStoich,isIncomplete,isGeneral]=getRxnsFromKEGG(keggPath); +0076 +0077 %Get the KO ids that are associated with any of the reactions. They +0078 %will be used later on to create a rxn-gene matrix +0079 KOs=cell(numel(model.rxns)*2,1); +0080 %Make room for two KO ids per reaction +0081 +0082 addToIndex=1; +0083 %Add all KO ids that are used +0084 for i=1:numel(model.rxns) +0085 if isstruct(model.rxnMiriams{i}) +0086 for j=1:numel(model.rxnMiriams{i}.name) +0087 if strcmpi('kegg.orthology',model.rxnMiriams{i}.name{j}) +0088 %Add the KO id +0089 KOs(addToIndex)=model.rxnMiriams{i}.value(j); +0090 addToIndex=addToIndex+1; +0091 end +0092 end +0093 end +0094 end +0095 +0096 KOs=KOs(1:addToIndex-1); +0097 KOs=unique(KOs); +0098 +0099 %Get all genes from any organism in KEGG that is associated with any of +0100 %the KOs +0101 KOModel=getGenesFromKEGG(keggPath,KOs); +0102 +0103 fprintf('Pruning the global KEGG model from the partially annotated, lumped KEGG Orthology entries... ') +0104 model.genes=KOModel.genes; +0105 +0106 %It can be that there are KOs from the reactions that have no database +0107 %entry. These are (as far as I have seen) lumped versions of other KOs +0108 %and should be removed +0109 KOsToRemove=setdiff(KOs, KOModel.rxns); +0110 +0111 %Loop through all reactions and delete the KOs that were not found +0112 for i=1:numel(model.rxns) +0113 if isstruct(model.rxnMiriams{i}) +0114 for j=1:numel(model.rxnMiriams{i}.name) +0115 toDel=[]; +0116 if strcmp(model.rxnMiriams{i}.name{j},'kegg.orthology') +0117 if ismember(model.rxnMiriams{i}.value{j},KOsToRemove) +0118 toDel=[toDel;j]; +0119 end +0120 end +0121 end +0122 %Delete the KOs +0123 if any(toDel) +0124 %If all posts are deleted, then delete the whole structure +0125 if numel(toDel)==j +0126 model.rxnMiriams{i}=[]; +0127 else +0128 model.rxnMiriams{i}.name(toDel)=[]; +0129 model.rxnMiriams{i}.value(toDel)=[]; +0130 end +0131 end +0132 end +0133 end +0134 fprintf('COMPLETE\n'); +0135 +0136 fprintf('Constructing the rxnGeneMat for the global KEGG model... 0%% complete'); +0137 %Create the rxnGeneMat for the reactions. This is simply done by +0138 %merging the gene associations for all the involved KOs +0139 r=zeros(10000000,1); +0140 %Store the positions since it's slow to write to a sparse array in a +0141 %loop +0142 c=zeros(10000000,1); +0143 counter=1; +0144 for i=1:numel(model.rxns) +0145 if isstruct(model.rxnMiriams{i}) +0146 I=strncmp('kegg.orthology',model.rxnMiriams{i}.name,18); +0147 if any(I) +0148 [J, K]=ismember(model.rxnMiriams{i}.value(I),KOModel.rxns); +0149 %Find all gene indexes that correspond to any of these KOs +0150 [~, L]=find(KOModel.rxnGeneMat(K(J),:)); +0151 if any(L) +0152 %Allocate room for more elements if needed +0153 if counter+numel(L)-1>=numel(r) +0154 r=[r;zeros(numel(r),1)]; +0155 c=[c;zeros(numel(c),1)]; +0156 end +0157 r(counter:counter+numel(L)-1)=ones(numel(L),1)*i; +0158 c(counter:counter+numel(L)-1)=L(:); +0159 counter=counter+numel(L); +0160 end +0161 end +0162 end +0163 if rem(i-1,100) == 0 +0164 progress=pad(num2str(floor(i/numel(model.rxns)*100)),3,'left'); +0165 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0166 end +0167 end +0168 +0169 model.rxnGeneMat=sparse(r(1:counter-1),c(1:counter-1),ones(counter-1,1)); +0170 if size(model.rxnGeneMat,1)~=numel(model.rxns) || size(model.rxnGeneMat,2)~=numel(KOModel.genes) +0171 model.rxnGeneMat(numel(model.rxns),numel(KOModel.genes))=0; +0172 end +0173 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0174 +0175 %Then get all metabolites +0176 metModel=getMetsFromKEGG(keggPath); +0177 +0178 fprintf('Finalizing the global KEGG model... '); +0179 %Add information about all metabolites to the model +0180 [a, b]=ismember(model.mets,metModel.mets); +0181 a=find(a); +0182 b=b(a); +0183 +0184 if ~isfield(model,'metNames') +0185 model.metNames=cell(numel(model.mets),1); +0186 model.metNames(:)={''}; +0187 end +0188 model.metNames(a)=metModel.metNames(b); +0189 +0190 if ~isfield(model,'metFormulas') +0191 model.metFormulas=cell(numel(model.mets),1); +0192 model.metFormulas(:)={''}; +0193 end +0194 model.metFormulas(a)=metModel.metFormulas(b); +0195 +0196 if ~isfield(model,'inchis') +0197 model.inchis=cell(numel(model.mets),1); +0198 model.inchis(:)={''}; +0199 end +0200 model.inchis(a)=metModel.inchis(b); +0201 +0202 if ~isfield(model,'metMiriams') +0203 model.metMiriams=cell(numel(model.mets),1); +0204 end +0205 model.metMiriams(a)=metModel.metMiriams(b); +0206 +0207 %The composition should be loaded from InChIs when available +0208 I=find(~cellfun(@isempty,model.inchis)); +0209 for i=1:numel(I) +0210 S=regexp(model.inchis(I(i)),'/','split'); +0211 S=S{1}; +0212 if numel(S)>=2 +0213 %Don't copy if it doesn't look good +0214 model.metFormulas(I(i))=S(2); +0215 end +0216 end +0217 +0218 %Put all metabolites in one compartment called 's' (for system). This +0219 %is done just to be more compatible with the rest of the code +0220 model.comps={'s'}; +0221 model.compNames={'System'}; +0222 model.metComps=ones(numel(model.mets),1); +0223 +0224 %If reactions with undefined stoichiometry are kept, then the +0225 %corresponding metabolites will have ids such as "(n+1) C00001" and +0226 %their names will be empty. These ids are not valid SBML identifiers +0227 %and are therefore replaced with "undefined1, undefined2...". The +0228 %former ids are stored as the new names +0229 I=find(cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m'))); +0230 model.metNames(I)=model.mets(I); +0231 repNums=1:numel(I); +0232 repIDs=strcat('undefined_',cellfun(@num2str,num2cell(repNums(:)),'UniformOutput',false)); +0233 model.mets(I)=repIDs; +0234 +0235 %It could also be that the metabolite names are empty for some other +0236 %reason. In that case, use the ID instead +0237 I=cellfun(@isempty,model.metNames); +0238 model.metNames(I)=model.mets(I); +0239 +0240 %Deafult LB and UB +0241 model.annotation.defaultLB = -1000; +0242 model.annotation.defaultUB = 1000; +0243 +0244 %Save the model structure +0245 save(modelFile,'model','KOModel','isGeneral','isIncomplete','isUndefinedStoich','isSpontaneous'); +0246 fprintf('COMPLETE\n'); +0247 end +0248 +0249 %Delete reactions which are labeled as "incomplete", "erroneous", +0250 %"unclear", "general reaction" or having undefined stoichiometry (depending +0251 %on settings) +0252 if keepSpontaneous==false +0253 model=removeReactions(model,intersect(isSpontaneous,model.rxns),true,true); +0254 end +0255 if keepUndefinedStoich==false +0256 model=removeReactions(model,intersect(isUndefinedStoich,model.rxns),true,true); +0257 end +0258 if keepIncomplete==false +0259 model=removeReactions(model,intersect(isIncomplete,model.rxns),true,true); +0260 end +0261 if keepGeneral==false +0262 model=removeReactions(model,intersect(isGeneral,model.rxns),true,true); +0263 end +0264 +0265 end +0266 +0267 function output = isNewestFile(ravenPath) +0268 %The ad hoc function, which checks whether keggModel.mat is the more +0269 %recently modified than keggRxns.mat, keggGenes.mat and keggRxns.mat +0270 modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat'); +0271 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); +0272 genesFile=fullfile(ravenPath,'external','kegg','keggGenes.mat'); +0273 metsFile=fullfile(ravenPath,'external','kegg','keggMets.mat'); +0274 if (getFileTime(modelFile)>getFileTime(rxnsFile))&&... +0275 (getFileTime(modelFile)>getFileTime(genesFile))&&... +0276 (getFileTime(modelFile)>getFileTime(metsFile)) +0277 output=1; +0278 else +0279 output=0; 0280 end -0281 -0282 function modTime = getFileTime(fileName) -0283 %Gets a last modification time for a particular file in datenum format that -0284 %the numbers could be easily compared for different files -0285 listing = dir(fileName); -0286 assert(numel(listing) == 1, 'No such file: %s', fileName); -0287 modTime = listing.datenum; -0288 format long; -0289 end +0281 end +0282 +0283 function modTime = getFileTime(fileName) +0284 %Gets a last modification time for a particular file in datenum format that +0285 %the numbers could be easily compared for different files +0286 listing = dir(fileName); +0287 assert(numel(listing) == 1, 'No such file: %s', fileName); +0288 modTime = listing.datenum; +0289 format long; +0290 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/external/kegg/getRxnsFromKEGG.html b/doc/external/kegg/getRxnsFromKEGG.html index 11fc7455..603c3d76 100644 --- a/doc/external/kegg/getRxnsFromKEGG.html +++ b/doc/external/kegg/getRxnsFromKEGG.html @@ -177,435 +177,436 @@

SOURCE CODE ^% (except for '///') 0069 % 0070 -0071 if nargin<1 -0072 keggPath='RAVEN/external/kegg'; -0073 else -0074 keggPath=char(keggPath); -0075 end -0076 -0077 %Check if the reactions have been parsed before and saved. If so, load the -0078 %model -0079 ravenPath=findRAVENroot(); -0080 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); -0081 if exist(rxnsFile, 'file') -0082 fprintf(['Importing KEGG reactions from ' strrep(rxnsFile,'\','/') '... ']); -0083 load(rxnsFile); -0084 else -0085 fprintf(['NOTE: Cannot locate ' strrep(rxnsFile,'\','/') ', it will therefore be generated from the local KEGG database\n']); -0086 if ~isfile(fullfile(keggPath,'reaction')) || ~isfile(fullfile(keggPath,'reaction.lst')) || ~isfile(fullfile(keggPath,'reaction_mapformula.lst')) -0087 EM=fprintf(['The files ''reaction'', ''reaction.lst'' and ''reaction_mapformula.lst'' cannot be located at ' strrep(keggPath,'\','/') '/ and should be downloaded from the KEGG FTP\n']); -0088 dispEM(EM); -0089 else -0090 fprintf('Generating keggRxns.mat file... '); -0091 %Add new functionality in the order specified in models -0092 model.id='KEGG'; -0093 model.name='Automatically generated from KEGG database'; -0094 -0095 %Preallocate memory for 15000 reactions -0096 model.rxns=cell(15000,1); -0097 model.rxnNames=cell(15000,1); -0098 model.eccodes=cell(15000,1); -0099 model.subSystems=cell(15000,1); -0100 model.rxnMiriams=cell(15000,1); -0101 model.rxnNotes=cell(15000,1); -0102 equations=cell(15000,1); -0103 %Temporarily store the equations -0104 -0105 isSpontaneous=false(15000,1); -0106 isIncomplete=false(15000,1); -0107 isGeneral=false(15000,1); -0108 -0109 %First load information on reaction ID, reaction name, KO, pathway, -0110 %and ec-number -0111 fid = fopen(fullfile(keggPath,'reaction'), 'r'); -0112 -0113 %Keep track of how many reactions have been added -0114 rxnCounter=0; -0115 -0116 %Loop through the file -0117 orthology=false; -0118 pathway=false; -0119 module=false; -0120 while 1 -0121 %Get the next line -0122 tline = fgetl(fid); -0123 -0124 %Abort at end of file -0125 if ~ischar(tline) -0126 break; -0127 end -0128 -0129 %Skip '///' -0130 if numel(tline)<12 -0131 continue; -0132 end -0133 -0134 %Check if it's a new reaction -0135 if strcmp(tline(1:12),'ENTRY ') -0136 rxnCounter=rxnCounter+1; -0137 -0138 %Add empty strings where there should be such -0139 model.rxnNames{rxnCounter}=''; -0140 model.eccodes{rxnCounter}=''; -0141 %model.subSystems{rxnCounter}=''; %remain empty cell -0142 model.rxnNotes{rxnCounter}=''; -0143 equations{rxnCounter}=''; -0144 -0145 %Add reaction ID (always 6 characters) -0146 model.rxns{rxnCounter}=tline(13:18); -0147 orthology=false; -0148 pathway=false; -0149 module=false; -0150 -0151 %Add KEGG reaction ID miriam -0152 tempStruct=model.rxnMiriams{rxnCounter}; -0153 tempStruct.name{1,1}='kegg.reaction'; -0154 tempStruct.value{1,1}=tline(13:18); -0155 model.rxnMiriams{rxnCounter}=tempStruct; -0156 end -0157 -0158 %Add name -0159 if strcmp(tline(1:12),'NAME ') -0160 model.rxnNames{rxnCounter}=tline(13:end); -0161 end -0162 -0163 %Add whether the comment includes "incomplete", "erroneous" or -0164 %"unclear" -0165 if strcmp(tline(1:12),'COMMENT ') -0166 %Read all text until '///', 'RPAIR', 'ENZYME', 'PATHWAY' or 'RCLASS' -0167 commentText=tline(13:end); -0168 while 1 -0169 tline = fgetl(fid); -0170 if ~strcmp(tline(1:3),'///') && ~strcmp(tline(1:3),'RPA') && ~strcmp(tline(1:3),'ENZ') && ~strcmp(tline(1:3),'PAT') && ~strcmp(tline(1:3),'RCL') -0171 commentText=[commentText ' ' strtrim(tline)]; -0172 else -0173 break; -0174 end -0175 end -0176 if any(regexpi(commentText,'SPONTANEOUS')) -0177 %It should start this way -0178 isSpontaneous(rxnCounter)=true; -0179 end -0180 if any(regexpi(commentText,'INCOMPLETE')) || any(regexpi(commentText,'ERRONEOUS')) || any(regexpi(commentText,'UNCLEAR')) -0181 isIncomplete(rxnCounter)=true; -0182 end -0183 if any(regexpi(commentText,'GENERAL REACTION')) -0184 %It should start this way -0185 isGeneral(rxnCounter)=true; -0186 end -0187 -0188 %Go to next iteration if it is '///' -0189 if numel(tline)<12 -0190 continue; -0191 end -0192 end -0193 -0194 %Add ec-number -0195 if strcmp(tline(1:12),'ENZYME ') -0196 model.eccodes{rxnCounter}=tline(13:end); -0197 model.eccodes{rxnCounter}=deblank(model.eccodes{rxnCounter}); -0198 model.eccodes{rxnCounter}=regexprep(model.eccodes{rxnCounter},'\s+',';'); -0199 end -0200 if numel(tline)>8 -0201 if strcmp(tline(1:9),'REFERENCE') -0202 pathway=false; -0203 orthology=false; -0204 end -0205 end -0206 -0207 %Add module ids -0208 if numel(tline)>18 -0209 if strcmp(tline(1:12),'MODULE ') || module==true -0210 pathway=false; -0211 orthology=false; -0212 if isstruct(model.rxnMiriams{rxnCounter}) -0213 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; -0214 else -0215 addToIndex=1; -0216 end -0217 tempStruct=model.rxnMiriams{rxnCounter}; -0218 tempStruct.name{addToIndex,1}='kegg.module'; -0219 tempStruct.value{addToIndex,1}=tline(13:18); -0220 model.rxnMiriams{rxnCounter}=tempStruct; -0221 end -0222 end -0223 -0224 %Add RHEA id -0225 if numel(tline)>18 -0226 if strcmp(tline(1:18),'DBLINKS RHEA: ') -0227 pathway=false; -0228 orthology=false; -0229 module=false; -0230 if isstruct(model.rxnMiriams{rxnCounter}) -0231 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; -0232 else -0233 addToIndex=1; -0234 end -0235 tempStruct=model.rxnMiriams{rxnCounter}; -0236 tempStruct.name{addToIndex,1}='rhea'; -0237 tempStruct.value{addToIndex,1}=tline(19:end); -0238 model.rxnMiriams{rxnCounter}=tempStruct; -0239 end -0240 end -0241 -0242 %Add KO-ids -0243 if numel(tline)>16 -0244 if strcmp(tline(1:16),'ORTHOLOGY KO: ') || strcmp(tline(1:16),' KO: ') || strcmp(tline(1:12),'ORTHOLOGY ') || orthology==true -0245 pathway=false; -0246 module=false; -0247 %Check if KO has been added already (each reaction may -0248 %belong to several) -0249 if isstruct(model.rxnMiriams{rxnCounter}) -0250 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; -0251 else -0252 addToIndex=1; -0253 end -0254 -0255 tempStruct=model.rxnMiriams{rxnCounter}; -0256 tempStruct.name{addToIndex,1}='kegg.orthology'; -0257 if strcmp(tline(13:16),'KO:') -0258 %This is in the old version -0259 tempStruct.value{addToIndex,1}=tline(17:22); -0260 else -0261 %This means that it found one KO in the new format -0262 %and that subsequent lines might be other KOs -0263 orthology=true; -0264 tempStruct.value{addToIndex,1}=tline(13:18); -0265 end -0266 model.rxnMiriams{rxnCounter}=tempStruct; -0267 end -0268 end -0269 -0270 %Add pathways -0271 if numel(tline)>18 -0272 if strcmp(tline(1:18),'PATHWAY PATH: ') || strcmp(tline(1:18),' PATH: ') || strcmp(tline(1:12),'PATHWAY ') || pathway==true -0273 orthology=false; -0274 module=false; -0275 %Check if annotation has been added already -0276 if isstruct(model.rxnMiriams{rxnCounter}) -0277 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; -0278 else -0279 addToIndex=1; -0280 end -0281 -0282 tempStruct=model.rxnMiriams{rxnCounter}; -0283 tempStruct.name{addToIndex,1}='kegg.pathway'; -0284 %If it is the old version -0285 if strcmp(tline(14:17),'PATH:') -0286 tempStruct.value{addToIndex,1}=tline(19:25); -0287 else -0288 %If it is the new version -0289 tempStruct.value{addToIndex,1}=tline(13:19); -0290 pathway=true; -0291 end -0292 -0293 %Do not save global or overview pathways. The ids for -0294 %such pathways begin with rn011 or rn012 -0295 if ~strcmp('rn011',tempStruct.value{addToIndex,1}(1:5)) && ~strcmp('rn012',tempStruct.value{addToIndex,1}(1:5)) -0296 model.rxnMiriams{rxnCounter}=tempStruct; -0297 -0298 %Also save the subSystems names. For the old KEGG -0299 %format, only the first mentioned subsystem is -0300 %picked. Use the newer KEGG format to fetch all the -0301 %subsystems -0302 if strcmp(tline(14:17),'PATH:') -0303 %The old format -0304 model.subSystems{rxnCounter}=tline(28:end); -0305 else -0306 %The new format -0307 model.subSystems{rxnCounter,1}{1,numel(model.subSystems{rxnCounter,1})+1}=tline(22:end); -0308 end -0309 end -0310 end -0311 end -0312 end -0313 -0314 %Close the file -0315 fclose(fid); -0316 -0317 %This is done here since the the indexes won't match since some -0318 %reactions are removed along the way -0319 isIncomplete=model.rxns(isIncomplete); -0320 isGeneral=model.rxns(isGeneral); -0321 isSpontaneous=model.rxns(isSpontaneous); -0322 -0323 %If too much space was allocated, shrink the model -0324 model.rxns=model.rxns(1:rxnCounter); -0325 model.rxnNames=model.rxnNames(1:rxnCounter); -0326 model.eccodes=model.eccodes(1:rxnCounter); -0327 equations=equations(1:rxnCounter); -0328 model.rxnMiriams=model.rxnMiriams(1:rxnCounter); -0329 model.rxnNotes=model.rxnNotes(1:rxnCounter); -0330 model.subSystems=model.subSystems(1:rxnCounter); -0331 -0332 %Then load the equations from another file. This is because the -0333 %equations are easier to retrieve from there -0334 -0335 %The format is rxnID: equation The reactions should have been -0336 %loaded in the exact same order -0337 fid = fopen(fullfile(keggPath,'reaction.lst'), 'r'); -0338 -0339 %Loop through the file -0340 for i=1:rxnCounter -0341 %Get the next line -0342 tline = fgetl(fid); -0343 -0344 equations{i}=tline(9:end); -0345 end -0346 -0347 %Close the file -0348 fclose(fid); -0349 -0350 %Several equations may have two whitespaces between the last -0351 %reactant and the reversible arrow sign. The number of whitespaces -0352 %is thus reduced to one -0353 equations = regexprep(equations,' <=>', ' <=>'); -0354 -0355 %Construct the S matrix and list of metabolites -0356 [S, mets, badRxns]=constructS(equations); -0357 model.S=S; -0358 model.mets=mets; -0359 -0360 %There is some limited evidence for directionality in -0361 %reaction_mapformula.lst. The information there concerns how the -0362 %reactions are drawn in the KEGG maps. If a reaction is -0363 %irreversible in the same direction for all maps, then I consider -0364 %is irreversible, otherwise reversible. Also, not all reactions are -0365 %present in the maps, so not all will have directionality. They -0366 %will be considered to be reversible -0367 -0368 %The format is R00005: 00330: C01010 => C00011 Generate a -0369 %reversibility structure with the fields: *rxns: reaction ids -0370 %*product: one met id that is a product. This is because the -0371 %*reactions might be written in another direction compared to in -0372 % the reactions.lst file -0373 %*rev: 1 if reversible, otherwise 0 -0374 reversibility.rxns={}; -0375 reversibility.product={}; -0376 reversibility.rev=[]; -0377 -0378 fid = fopen(fullfile(keggPath,'reaction_mapformula.lst'), 'r'); -0379 while 1 -0380 %Get the next line -0381 tline = fgetl(fid); -0382 -0383 %Abort at end of file -0384 if ~ischar(tline) -0385 break; -0386 end -0387 -0388 rxn=tline(1:6); -0389 prod=tline(end-5:end); -0390 rev=any(strfind(tline,'<=>')); -0391 if isempty(reversibility.rxns) -0392 reversibility.rxns{1}=rxn; -0393 reversibility.product{1}=prod; -0394 reversibility.rev(1)=rev; -0395 elseif strcmp(reversibility.rxns(end),rxn) -0396 %Check if the reaction was added before. It's an ordered -0397 %list, so only check the last element If it's reversible in -0398 %the new reaction or reversible in the old reaction then -0399 %set (keep) to be reversible -0400 if rev==1 || reversibility.rev(end)==1 -0401 reversibility.rev(end)=1; -0402 else -0403 %This means that the reaction was already loaded, that -0404 %it was irreversible before and irreversible in the new -0405 %reaction. However, it could be that they are written -0406 %in diferent directions. If the product differ, then -0407 %set to be reversible. This assumes that the reactions -0408 %are written with the same metabolite as the last one -0409 %if they are in the same direction -0410 if ~strcmp(prod,reversibility.product(end)) -0411 reversibility.rev(end)=1; -0412 end -0413 end -0414 else -0415 reversibility.rxns=[reversibility.rxns;rxn]; -0416 reversibility.product=[reversibility.product;prod]; -0417 reversibility.rev=[reversibility.rev;rev]; -0418 end -0419 end -0420 fclose(fid); -0421 -0422 %Update the reversibility -0423 model.rev=ones(rxnCounter,1); -0424 %Match the reaction ids -0425 irrevIDs=find(reversibility.rev==0); -0426 [~, I]=ismember(reversibility.rxns(irrevIDs),model.rxns); -0427 [~, prodMetIDs]=ismember(reversibility.product(irrevIDs),model.mets); -0428 model.rev(I)=0; -0429 -0430 %See if the reactions are written in the same order in model.S -0431 linearInd=sub2ind(size(model.S), prodMetIDs, I); -0432 changeOrder=I(model.S(linearInd)<0); -0433 %Change the order of these reactions -0434 model.S(:,changeOrder)=model.S(:,changeOrder).*-1; -0435 -0436 %Add some stuff to get a correct model structure -0437 model.ub=ones(rxnCounter,1)*1000; -0438 model.lb=model.rev*-1000; -0439 model.c=zeros(rxnCounter,1); -0440 model.b=zeros(numel(model.mets),1); -0441 model=removeReactions(model,badRxns,true,true); -0442 -0443 %Identify reactions with undefined stoichiometry. Such -0444 %reactions involve metabolites with an ID containing the letter "n" -0445 %or "m" -0446 I=cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m')); -0447 [~, J]=find(model.S(I,:)); -0448 isUndefinedStoich=model.rxns(unique(J)); -0449 %Sort model that metabolites with undefined stoichiometry would -0450 %appear in the end of metabolites list -0451 metList=[model.mets(~I);model.mets(I)]; -0452 [~,metIndexes]=ismember(metList,model.mets); -0453 model=permuteModel(model,metIndexes,'mets'); -0454 -0455 %Sort model that i) spontaneous, ii) with undefined -0456 %stoichiometry, iii) incomplete and iv) general reactions would bve -0457 %ranked in the end of the model -0458 endRxnList=unique([model.rxns(ismember(model.rxns,isSpontaneous));model.rxns(ismember(model.rxns,isUndefinedStoich));model.rxns(ismember(model.rxns,isIncomplete));model.rxns(ismember(model.rxns,isGeneral))],'stable'); -0459 rxnList=[model.rxns(~ismember(model.rxns,endRxnList));endRxnList]; -0460 [~,rxnIndexes]=ismember(rxnList,model.rxns); -0461 model=permuteModel(model,rxnIndexes,'rxns'); -0462 -0463 %Add information in rxnNotes, whether reaction belongs to any of -0464 %type i-iv mentioned a few lines above -0465 for i=(numel(rxnList)-numel(endRxnList)+1):numel(model.rxns) -0466 if ismember(model.rxns(i),isSpontaneous) -0467 model.rxnNotes(i)=strcat(model.rxnNotes(i),'Spontaneous'); -0468 end -0469 if ismember(model.rxns(i),isUndefinedStoich) -0470 if isempty(model.rxnNotes{i}) -0471 model.rxnNotes(i)=strcat(model.rxnNotes(i),'With undefined stoichiometry'); -0472 else -0473 model.rxnNotes(i)=strcat(model.rxnNotes(i),', with undefined stoichiometry'); -0474 end -0475 end -0476 if ismember(model.rxns(i),isIncomplete) -0477 if isempty(model.rxnNotes{i}) -0478 model.rxnNotes(i)=strcat(model.rxnNotes(i),'Incomplete'); -0479 else -0480 model.rxnNotes(i)=strcat(model.rxnNotes(i),', incomplete'); -0481 end -0482 end -0483 if ismember(model.rxns(i),isGeneral) -0484 if isempty(model.rxnNotes{i}) -0485 model.rxnNotes(i)=strcat(model.rxnNotes(i),'General'); -0486 else -0487 model.rxnNotes(i)=strcat(model.rxnNotes(i),', general'); -0488 end -0489 end -0490 model.rxnNotes(i)=strcat(model.rxnNotes(i),' reaction'); -0491 end -0492 -0493 %Save the model structure -0494 save(rxnsFile,'model','isGeneral','isIncomplete','isUndefinedStoich','isSpontaneous'); -0495 end -0496 end -0497 fprintf('COMPLETE\n'); -0498 -0499 end +0071 ravenPath=findRAVENroot(); +0072 +0073 if nargin<1 +0074 keggPath=fullfile(ravenPath,'external','kegg'); +0075 else +0076 keggPath=char(keggPath); +0077 end +0078 +0079 %Check if the reactions have been parsed before and saved. If so, load the +0080 %model +0081 rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); +0082 if exist(rxnsFile, 'file') +0083 fprintf(['Importing KEGG reactions from ' strrep(rxnsFile,'\','/') '... ']); +0084 load(rxnsFile); +0085 else +0086 fprintf(['NOTE: Cannot locate ' strrep(rxnsFile,'\','/') ', it will therefore be generated from the local KEGG database\n']); +0087 if ~isfile(fullfile(keggPath,'reaction')) || ~isfile(fullfile(keggPath,'reaction.lst')) || ~isfile(fullfile(keggPath,'reaction_mapformula.lst')) +0088 EM=fprintf(['The files ''reaction'', ''reaction.lst'' and ''reaction_mapformula.lst'' cannot be located at ' strrep(keggPath,'\','/') '/ and should be downloaded from the KEGG FTP\n']); +0089 dispEM(EM); +0090 else +0091 fprintf('Generating keggRxns.mat file... '); +0092 %Add new functionality in the order specified in models +0093 model.id='KEGG'; +0094 model.name='Automatically generated from KEGG database'; +0095 +0096 %Preallocate memory for 15000 reactions +0097 model.rxns=cell(15000,1); +0098 model.rxnNames=cell(15000,1); +0099 model.eccodes=cell(15000,1); +0100 model.subSystems=cell(15000,1); +0101 model.rxnMiriams=cell(15000,1); +0102 model.rxnNotes=cell(15000,1); +0103 equations=cell(15000,1); +0104 %Temporarily store the equations +0105 +0106 isSpontaneous=false(15000,1); +0107 isIncomplete=false(15000,1); +0108 isGeneral=false(15000,1); +0109 +0110 %First load information on reaction ID, reaction name, KO, pathway, +0111 %and ec-number +0112 fid = fopen(fullfile(keggPath,'reaction'), 'r'); +0113 +0114 %Keep track of how many reactions have been added +0115 rxnCounter=0; +0116 +0117 %Loop through the file +0118 orthology=false; +0119 pathway=false; +0120 module=false; +0121 while 1 +0122 %Get the next line +0123 tline = fgetl(fid); +0124 +0125 %Abort at end of file +0126 if ~ischar(tline) +0127 break; +0128 end +0129 +0130 %Skip '///' +0131 if numel(tline)<12 +0132 continue; +0133 end +0134 +0135 %Check if it's a new reaction +0136 if strcmp(tline(1:12),'ENTRY ') +0137 rxnCounter=rxnCounter+1; +0138 +0139 %Add empty strings where there should be such +0140 model.rxnNames{rxnCounter}=''; +0141 model.eccodes{rxnCounter}=''; +0142 %model.subSystems{rxnCounter}=''; %remain empty cell +0143 model.rxnNotes{rxnCounter}=''; +0144 equations{rxnCounter}=''; +0145 +0146 %Add reaction ID (always 6 characters) +0147 model.rxns{rxnCounter}=tline(13:18); +0148 orthology=false; +0149 pathway=false; +0150 module=false; +0151 +0152 %Add KEGG reaction ID miriam +0153 tempStruct=model.rxnMiriams{rxnCounter}; +0154 tempStruct.name{1,1}='kegg.reaction'; +0155 tempStruct.value{1,1}=tline(13:18); +0156 model.rxnMiriams{rxnCounter}=tempStruct; +0157 end +0158 +0159 %Add name +0160 if strcmp(tline(1:12),'NAME ') +0161 model.rxnNames{rxnCounter}=tline(13:end); +0162 end +0163 +0164 %Add whether the comment includes "incomplete", "erroneous" or +0165 %"unclear" +0166 if strcmp(tline(1:12),'COMMENT ') +0167 %Read all text until '///', 'RPAIR', 'ENZYME', 'PATHWAY' or 'RCLASS' +0168 commentText=tline(13:end); +0169 while 1 +0170 tline = fgetl(fid); +0171 if ~strcmp(tline(1:3),'///') && ~strcmp(tline(1:3),'RPA') && ~strcmp(tline(1:3),'ENZ') && ~strcmp(tline(1:3),'PAT') && ~strcmp(tline(1:3),'RCL') +0172 commentText=[commentText ' ' strtrim(tline)]; +0173 else +0174 break; +0175 end +0176 end +0177 if any(regexpi(commentText,'SPONTANEOUS')) +0178 %It should start this way +0179 isSpontaneous(rxnCounter)=true; +0180 end +0181 if any(regexpi(commentText,'INCOMPLETE')) || any(regexpi(commentText,'ERRONEOUS')) || any(regexpi(commentText,'UNCLEAR')) +0182 isIncomplete(rxnCounter)=true; +0183 end +0184 if any(regexpi(commentText,'GENERAL REACTION')) +0185 %It should start this way +0186 isGeneral(rxnCounter)=true; +0187 end +0188 +0189 %Go to next iteration if it is '///' +0190 if numel(tline)<12 +0191 continue; +0192 end +0193 end +0194 +0195 %Add ec-number +0196 if strcmp(tline(1:12),'ENZYME ') +0197 model.eccodes{rxnCounter}=tline(13:end); +0198 model.eccodes{rxnCounter}=deblank(model.eccodes{rxnCounter}); +0199 model.eccodes{rxnCounter}=regexprep(model.eccodes{rxnCounter},'\s+',';'); +0200 end +0201 if numel(tline)>8 +0202 if strcmp(tline(1:9),'REFERENCE') +0203 pathway=false; +0204 orthology=false; +0205 end +0206 end +0207 +0208 %Add module ids +0209 if numel(tline)>18 +0210 if strcmp(tline(1:12),'MODULE ') || module==true +0211 pathway=false; +0212 orthology=false; +0213 if isstruct(model.rxnMiriams{rxnCounter}) +0214 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; +0215 else +0216 addToIndex=1; +0217 end +0218 tempStruct=model.rxnMiriams{rxnCounter}; +0219 tempStruct.name{addToIndex,1}='kegg.module'; +0220 tempStruct.value{addToIndex,1}=tline(13:18); +0221 model.rxnMiriams{rxnCounter}=tempStruct; +0222 end +0223 end +0224 +0225 %Add RHEA id +0226 if numel(tline)>18 +0227 if strcmp(tline(1:18),'DBLINKS RHEA: ') +0228 pathway=false; +0229 orthology=false; +0230 module=false; +0231 if isstruct(model.rxnMiriams{rxnCounter}) +0232 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; +0233 else +0234 addToIndex=1; +0235 end +0236 tempStruct=model.rxnMiriams{rxnCounter}; +0237 tempStruct.name{addToIndex,1}='rhea'; +0238 tempStruct.value{addToIndex,1}=tline(19:end); +0239 model.rxnMiriams{rxnCounter}=tempStruct; +0240 end +0241 end +0242 +0243 %Add KO-ids +0244 if numel(tline)>16 +0245 if strcmp(tline(1:16),'ORTHOLOGY KO: ') || strcmp(tline(1:16),' KO: ') || strcmp(tline(1:12),'ORTHOLOGY ') || orthology==true +0246 pathway=false; +0247 module=false; +0248 %Check if KO has been added already (each reaction may +0249 %belong to several) +0250 if isstruct(model.rxnMiriams{rxnCounter}) +0251 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; +0252 else +0253 addToIndex=1; +0254 end +0255 +0256 tempStruct=model.rxnMiriams{rxnCounter}; +0257 tempStruct.name{addToIndex,1}='kegg.orthology'; +0258 if strcmp(tline(13:16),'KO:') +0259 %This is in the old version +0260 tempStruct.value{addToIndex,1}=tline(17:22); +0261 else +0262 %This means that it found one KO in the new format +0263 %and that subsequent lines might be other KOs +0264 orthology=true; +0265 tempStruct.value{addToIndex,1}=tline(13:18); +0266 end +0267 model.rxnMiriams{rxnCounter}=tempStruct; +0268 end +0269 end +0270 +0271 %Add pathways +0272 if numel(tline)>18 +0273 if strcmp(tline(1:18),'PATHWAY PATH: ') || strcmp(tline(1:18),' PATH: ') || strcmp(tline(1:12),'PATHWAY ') || pathway==true +0274 orthology=false; +0275 module=false; +0276 %Check if annotation has been added already +0277 if isstruct(model.rxnMiriams{rxnCounter}) +0278 addToIndex=numel(model.rxnMiriams{rxnCounter}.name)+1; +0279 else +0280 addToIndex=1; +0281 end +0282 +0283 tempStruct=model.rxnMiriams{rxnCounter}; +0284 tempStruct.name{addToIndex,1}='kegg.pathway'; +0285 %If it is the old version +0286 if strcmp(tline(14:17),'PATH:') +0287 tempStruct.value{addToIndex,1}=tline(19:25); +0288 else +0289 %If it is the new version +0290 tempStruct.value{addToIndex,1}=tline(13:19); +0291 pathway=true; +0292 end +0293 +0294 %Do not save global or overview pathways. The ids for +0295 %such pathways begin with rn011 or rn012 +0296 if ~strcmp('rn011',tempStruct.value{addToIndex,1}(1:5)) && ~strcmp('rn012',tempStruct.value{addToIndex,1}(1:5)) +0297 model.rxnMiriams{rxnCounter}=tempStruct; +0298 +0299 %Also save the subSystems names. For the old KEGG +0300 %format, only the first mentioned subsystem is +0301 %picked. Use the newer KEGG format to fetch all the +0302 %subsystems +0303 if strcmp(tline(14:17),'PATH:') +0304 %The old format +0305 model.subSystems{rxnCounter}=tline(28:end); +0306 else +0307 %The new format +0308 model.subSystems{rxnCounter,1}{1,numel(model.subSystems{rxnCounter,1})+1}=tline(22:end); +0309 end +0310 end +0311 end +0312 end +0313 end +0314 +0315 %Close the file +0316 fclose(fid); +0317 +0318 %This is done here since the the indexes won't match since some +0319 %reactions are removed along the way +0320 isIncomplete=model.rxns(isIncomplete); +0321 isGeneral=model.rxns(isGeneral); +0322 isSpontaneous=model.rxns(isSpontaneous); +0323 +0324 %If too much space was allocated, shrink the model +0325 model.rxns=model.rxns(1:rxnCounter); +0326 model.rxnNames=model.rxnNames(1:rxnCounter); +0327 model.eccodes=model.eccodes(1:rxnCounter); +0328 equations=equations(1:rxnCounter); +0329 model.rxnMiriams=model.rxnMiriams(1:rxnCounter); +0330 model.rxnNotes=model.rxnNotes(1:rxnCounter); +0331 model.subSystems=model.subSystems(1:rxnCounter); +0332 +0333 %Then load the equations from another file. This is because the +0334 %equations are easier to retrieve from there +0335 +0336 %The format is rxnID: equation The reactions should have been +0337 %loaded in the exact same order +0338 fid = fopen(fullfile(keggPath,'reaction.lst'), 'r'); +0339 +0340 %Loop through the file +0341 for i=1:rxnCounter +0342 %Get the next line +0343 tline = fgetl(fid); +0344 +0345 equations{i}=tline(9:end); +0346 end +0347 +0348 %Close the file +0349 fclose(fid); +0350 +0351 %Several equations may have two whitespaces between the last +0352 %reactant and the reversible arrow sign. The number of whitespaces +0353 %is thus reduced to one +0354 equations = regexprep(equations,' <=>', ' <=>'); +0355 +0356 %Construct the S matrix and list of metabolites +0357 [S, mets, badRxns]=constructS(equations); +0358 model.S=S; +0359 model.mets=mets; +0360 +0361 %There is some limited evidence for directionality in +0362 %reaction_mapformula.lst. The information there concerns how the +0363 %reactions are drawn in the KEGG maps. If a reaction is +0364 %irreversible in the same direction for all maps, then I consider +0365 %is irreversible, otherwise reversible. Also, not all reactions are +0366 %present in the maps, so not all will have directionality. They +0367 %will be considered to be reversible +0368 +0369 %The format is R00005: 00330: C01010 => C00011 Generate a +0370 %reversibility structure with the fields: *rxns: reaction ids +0371 %*product: one met id that is a product. This is because the +0372 %*reactions might be written in another direction compared to in +0373 % the reactions.lst file +0374 %*rev: 1 if reversible, otherwise 0 +0375 reversibility.rxns={}; +0376 reversibility.product={}; +0377 reversibility.rev=[]; +0378 +0379 fid = fopen(fullfile(keggPath,'reaction_mapformula.lst'), 'r'); +0380 while 1 +0381 %Get the next line +0382 tline = fgetl(fid); +0383 +0384 %Abort at end of file +0385 if ~ischar(tline) +0386 break; +0387 end +0388 +0389 rxn=tline(1:6); +0390 prod=tline(end-5:end); +0391 rev=any(strfind(tline,'<=>')); +0392 if isempty(reversibility.rxns) +0393 reversibility.rxns{1}=rxn; +0394 reversibility.product{1}=prod; +0395 reversibility.rev(1)=rev; +0396 elseif strcmp(reversibility.rxns(end),rxn) +0397 %Check if the reaction was added before. It's an ordered +0398 %list, so only check the last element If it's reversible in +0399 %the new reaction or reversible in the old reaction then +0400 %set (keep) to be reversible +0401 if rev==1 || reversibility.rev(end)==1 +0402 reversibility.rev(end)=1; +0403 else +0404 %This means that the reaction was already loaded, that +0405 %it was irreversible before and irreversible in the new +0406 %reaction. However, it could be that they are written +0407 %in diferent directions. If the product differ, then +0408 %set to be reversible. This assumes that the reactions +0409 %are written with the same metabolite as the last one +0410 %if they are in the same direction +0411 if ~strcmp(prod,reversibility.product(end)) +0412 reversibility.rev(end)=1; +0413 end +0414 end +0415 else +0416 reversibility.rxns=[reversibility.rxns;rxn]; +0417 reversibility.product=[reversibility.product;prod]; +0418 reversibility.rev=[reversibility.rev;rev]; +0419 end +0420 end +0421 fclose(fid); +0422 +0423 %Update the reversibility +0424 model.rev=ones(rxnCounter,1); +0425 %Match the reaction ids +0426 irrevIDs=find(reversibility.rev==0); +0427 [~, I]=ismember(reversibility.rxns(irrevIDs),model.rxns); +0428 [~, prodMetIDs]=ismember(reversibility.product(irrevIDs),model.mets); +0429 model.rev(I)=0; +0430 +0431 %See if the reactions are written in the same order in model.S +0432 linearInd=sub2ind(size(model.S), prodMetIDs, I); +0433 changeOrder=I(model.S(linearInd)<0); +0434 %Change the order of these reactions +0435 model.S(:,changeOrder)=model.S(:,changeOrder).*-1; +0436 +0437 %Add some stuff to get a correct model structure +0438 model.ub=ones(rxnCounter,1)*1000; +0439 model.lb=model.rev*-1000; +0440 model.c=zeros(rxnCounter,1); +0441 model.b=zeros(numel(model.mets),1); +0442 model=removeReactions(model,badRxns,true,true); +0443 +0444 %Identify reactions with undefined stoichiometry. Such +0445 %reactions involve metabolites with an ID containing the letter "n" +0446 %or "m" +0447 I=cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m')); +0448 [~, J]=find(model.S(I,:)); +0449 isUndefinedStoich=model.rxns(unique(J)); +0450 %Sort model that metabolites with undefined stoichiometry would +0451 %appear in the end of metabolites list +0452 metList=[model.mets(~I);model.mets(I)]; +0453 [~,metIndexes]=ismember(metList,model.mets); +0454 model=permuteModel(model,metIndexes,'mets'); +0455 +0456 %Sort model that i) spontaneous, ii) with undefined +0457 %stoichiometry, iii) incomplete and iv) general reactions would bve +0458 %ranked in the end of the model +0459 endRxnList=unique([model.rxns(ismember(model.rxns,isSpontaneous));model.rxns(ismember(model.rxns,isUndefinedStoich));model.rxns(ismember(model.rxns,isIncomplete));model.rxns(ismember(model.rxns,isGeneral))],'stable'); +0460 rxnList=[model.rxns(~ismember(model.rxns,endRxnList));endRxnList]; +0461 [~,rxnIndexes]=ismember(rxnList,model.rxns); +0462 model=permuteModel(model,rxnIndexes,'rxns'); +0463 +0464 %Add information in rxnNotes, whether reaction belongs to any of +0465 %type i-iv mentioned a few lines above +0466 for i=(numel(rxnList)-numel(endRxnList)+1):numel(model.rxns) +0467 if ismember(model.rxns(i),isSpontaneous) +0468 model.rxnNotes(i)=strcat(model.rxnNotes(i),'Spontaneous'); +0469 end +0470 if ismember(model.rxns(i),isUndefinedStoich) +0471 if isempty(model.rxnNotes{i}) +0472 model.rxnNotes(i)=strcat(model.rxnNotes(i),'With undefined stoichiometry'); +0473 else +0474 model.rxnNotes(i)=strcat(model.rxnNotes(i),', with undefined stoichiometry'); +0475 end +0476 end +0477 if ismember(model.rxns(i),isIncomplete) +0478 if isempty(model.rxnNotes{i}) +0479 model.rxnNotes(i)=strcat(model.rxnNotes(i),'Incomplete'); +0480 else +0481 model.rxnNotes(i)=strcat(model.rxnNotes(i),', incomplete'); +0482 end +0483 end +0484 if ismember(model.rxns(i),isGeneral) +0485 if isempty(model.rxnNotes{i}) +0486 model.rxnNotes(i)=strcat(model.rxnNotes(i),'General'); +0487 else +0488 model.rxnNotes(i)=strcat(model.rxnNotes(i),', general'); +0489 end +0490 end +0491 model.rxnNotes(i)=strcat(model.rxnNotes(i),' reaction'); +0492 end +0493 +0494 %Save the model structure +0495 save(rxnsFile,'model','isGeneral','isIncomplete','isUndefinedStoich','isSpontaneous'); +0496 end +0497 end +0498 fprintf('COMPLETE\n'); +0499 +0500 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/io/exportModel.html b/doc/io/exportModel.html index 6367bf1a..f538d90b 100644 --- a/doc/io/exportModel.html +++ b/doc/io/exportModel.html @@ -631,7 +631,7 @@

SOURCE CODE ^'UniformOutput', false), model.subSystems, 'UniformOutput', false); 0569 0570 % === 2) Flatten once: names and their reaction indices (vectorized) === -0571 flatNames = [model.subSystems{:}]; % 1×M cellstr of all subsystem labels +0571 flatNames = vertcat(model.subSystems{:}); % 1×M cellstr of all subsystem labels 0572 if isempty(flatNames) 0573 % Nothing to do: no subsystems present 0574 return diff --git a/doc/testing/unit_tests/hmmerTests.html b/doc/testing/unit_tests/hmmerTests.html index dec29c58..121a6d33 100644 --- a/doc/testing/unit_tests/hmmerTests.html +++ b/doc/testing/unit_tests/hmmerTests.html @@ -68,124 +68,136 @@

SOURCE CODE ^'-completenames'); 0022 ravenPath=fileparts(fileparts(fileparts(ST(I).file))); 0023 -0024 %Identify the operating system -0025 if isunix -0026 if ismac -0027 binEnd='.mac'; -0028 else -0029 binEnd=''; -0030 end -0031 elseif ispc -0032 binEnd='.exe'; -0033 else -0034 dispEM('Unknown OS, exiting.') -0035 return -0036 end -0037 -0038 %Create empty structures needed for HMMER results -0039 actHmmResult.genes = {}; -0040 actHmmResult.scores = []; -0041 -0042 %Create structures that contain expected HMMER results -0043 expHmmResult.genes = {'sp|P41947|MEL6_YEASX','sp|P41946|MEL5_YEASX', 'sp|P41945|MEL2_YEASX', 'sp|P04824|MEL1_YEASX'}; -0044 expHmmResult.scores = [10^-250, 10^-250, 10^-250, 10^-250]; -0045 -0046 %Generate temporary names for working directory and outFile -0047 tmpDIR=tempname; -0048 outFile=tempname; -0049 -0050 %Run HMMER multi-threaded to use all logical cores assigned to MATLAB -0051 cores = evalc('feature(''numcores'')'); -0052 cores = strsplit(cores, 'MATLAB was assigned: '); -0053 cores = regexp(cores{2},'^\d*','match'); -0054 cores = cores{1}; -0055 -0056 %Create a temporary folder and copy multi-FASTA file there -0057 [~, ~]=system(['mkdir "' tmpDIR '"']); -0058 -0059 sourceDir = fileparts(which(mfilename)); -0060 copyfile(fullfile(sourceDir,'test_data','yeast_galactosidases.fa'),tmpDIR); -0061 copyfile(fullfile(sourceDir,'test_data','human_galactosidases.fa'),tmpDIR); -0062 -0063 %% -0064 %Train a hidden Markov model -0065 [~, ~]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +0024 %Generate temporary names for working directory and outFile +0025 tmpDIR=tempname; +0026 outFile=tempname; +0027 +0028 %Create a temporary folder and copy multi-FASTA file there +0029 [~, ~]=system(['mkdir "' tmpDIR '"']); +0030 +0031 sourceDir = fileparts(which(mfilename)); +0032 copyfile(fullfile(sourceDir,'test_data','yeast_galactosidases.fa'),tmpDIR); +0033 copyfile(fullfile(sourceDir,'test_data','human_galactosidases.fa'),tmpDIR); +0034 +0035 +0036 %Identify the operating system +0037 if isunix +0038 if ismac +0039 binEnd='.mac'; +0040 else +0041 binEnd=''; +0042 end +0043 elseif ispc +0044 wslPath.tmpDIR = getWSLpath(tmpDIR); +0045 wslPath.hmmbuild = getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild')); +0046 wslPath.hmmsearch = getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch')); +0047 filesep='/'; +0048 else +0049 dispEM('Unknown OS, exiting.') +0050 return +0051 end +0052 +0053 %Create empty structures needed for HMMER results +0054 actHmmResult.genes = {}; +0055 actHmmResult.scores = []; +0056 +0057 %Create structures that contain expected HMMER results +0058 expHmmResult.genes = {'sp|P41947|MEL6_YEASX','sp|P41946|MEL5_YEASX', 'sp|P41945|MEL2_YEASX', 'sp|P04824|MEL1_YEASX'}; +0059 expHmmResult.scores = [10^-250, 10^-250, 10^-250, 10^-250]; +0060 +0061 %Run HMMER multi-threaded to use all logical cores assigned to MATLAB +0062 cores = evalc('feature(''numcores'')'); +0063 cores = strsplit(cores, 'MATLAB was assigned: '); +0064 cores = regexp(cores{2},'^\d*','match'); +0065 cores = cores{1}; 0066 -0067 %Run a homology search against the newly-trained HMM -0068 [~, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); -0069 -0070 %Save the output to a file -0071 fid=fopen(outFile,'w'); -0072 fwrite(fid,output); -0073 fclose(fid); +0067 %% +0068 %Train a hidden Markov model +0069 if ismac || isunix +0070 [~, ~]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +0071 else +0072 [~, ~]=system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.tmpDIR '/human_galactosidases.hmm" "' wslPath.tmpDIR '/yeast_galactosidases.fa"']); +0073 end 0074 -0075 %% -0076 %Parse the results -0077 geneCounter=0; -0078 fid=fopen(outFile,'r'); -0079 beginMatches=false; -0080 while 1 -0081 %Get the next line -0082 tline = fgetl(fid); -0083 -0084 %Abort at end of file -0085 if ~ischar(tline) -0086 break; -0087 end -0088 -0089 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) -0090 break; -0091 end -0092 -0093 if beginMatches==false -0094 %This is how the listing of matches begins -0095 if any(strfind(tline,'E-value ')) -0096 %Read one more line that is only padding -0097 tline = fgetl(fid); -0098 beginMatches=true; -0099 end -0100 else -0101 %If matches should be read -0102 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) -0103 elements=regexp(tline,' ','split'); -0104 elements=elements(cellfun(@any,elements)); -0105 -0106 %Check if the match is below the treshhold -0107 score=str2double(elements{1}); -0108 gene=elements{9}; -0109 if score<=10^-50 -0110 %If the score is exactly 0, change it to a very -0111 %small value to avoid NaN -0112 if score==0 -0113 score=10^-250; -0114 end -0115 %Check if the gene is added already and, is so, get -0116 %the best score for it -0117 geneCounter=geneCounter+1; -0118 actHmmResult.genes{geneCounter}=gene; -0119 actHmmResult.scores(geneCounter)=score; -0120 end -0121 else -0122 break; -0123 end -0124 end -0125 end -0126 fclose(fid); -0127 -0128 %Remove the old tempfiles -0129 delete([outFile '*']); -0130 -0131 %Remove temporary folder, since testing is finished -0132 [~, ~]=system(['rm "' tmpDIR '" -r']); -0133 -0134 %% -0135 %Test 1a: Check if HMMER results match the expected ones -0136 verifyEqual(testCase,actHmmResult,expHmmResult); -0137 -0138 %Test 1b: Modify actual HMMER results structure and check if test fails -0139 actHmmResult.score(2)=1; -0140 verifyNotEqual(testCase,actHmmResult,expHmmResult); -0141 end +0075 %Run a homology search against the newly-trained HMM +0076 if ismac || isunix +0077 [~, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +0078 else +0079 [~, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.tmpDIR '/human_galactosidases.hmm" "' wslPath.tmpDIR '/yeast_galactosidases.fa"']); +0080 end +0081 +0082 %Save the output to a file +0083 fid=fopen(outFile,'w'); +0084 fwrite(fid,output); +0085 fclose(fid); +0086 +0087 %% +0088 %Parse the results +0089 geneCounter=0; +0090 fid=fopen(outFile,'r'); +0091 beginMatches=false; +0092 while 1 +0093 %Get the next line +0094 tline = fgetl(fid); +0095 +0096 %Abort at end of file +0097 if ~ischar(tline) +0098 break; +0099 end +0100 +0101 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) +0102 break; +0103 end +0104 +0105 if beginMatches==false +0106 %This is how the listing of matches begins +0107 if any(strfind(tline,'E-value ')) +0108 %Read one more line that is only padding +0109 tline = fgetl(fid); +0110 beginMatches=true; +0111 end +0112 else +0113 %If matches should be read +0114 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) +0115 elements=regexp(tline,' ','split'); +0116 elements=elements(cellfun(@any,elements)); +0117 +0118 %Check if the match is below the treshhold +0119 score=str2double(elements{1}); +0120 gene=elements{9}; +0121 if score<=10^-50 +0122 %If the score is exactly 0, change it to a very +0123 %small value to avoid NaN +0124 if score==0 +0125 score=10^-250; +0126 end +0127 %Check if the gene is added already and, is so, get +0128 %the best score for it +0129 geneCounter=geneCounter+1; +0130 actHmmResult.genes{geneCounter}=gene; +0131 actHmmResult.scores(geneCounter)=score; +0132 end +0133 else +0134 break; +0135 end +0136 end +0137 end +0138 fclose(fid); +0139 +0140 %Remove the old tempfiles +0141 delete([outFile '*']); +0142 +0143 %Remove temporary folder, since testing is finished +0144 [~, ~]=system(['rm "' tmpDIR '" -r']); +0145 +0146 %% +0147 %Test 1a: Check if HMMER results match the expected ones +0148 verifyEqual(testCase,actHmmResult,expHmmResult); +0149 +0150 %Test 1b: Modify actual HMMER results structure and check if test fails +0151 actHmmResult.score(2)=1; +0152 verifyNotEqual(testCase,actHmmResult,expHmmResult); +0153 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/tutorial/index.html b/doc/tutorial/index.html index efef9dc9..b1ccd2cc 100644 --- a/doc/tutorial/index.html +++ b/doc/tutorial/index.html @@ -24,9 +24,7 @@

Matlab files in this directory:

Other Matlab-specific files in this directory:

  • empty.mat
  • iMK1208+suppInfo.mat
  • pathway.mat
  • pcPathway.mat
-

Subsequent directories:

-
    -
  • struct_conversion
+
Generated by m2html © 2005
diff --git a/external/kegg/getKEGGModelForOrganism.m b/external/kegg/getKEGGModelForOrganism.m index 24355620..770d9da1 100755 --- a/external/kegg/getKEGGModelForOrganism.m +++ b/external/kegg/getKEGGModelForOrganism.m @@ -38,7 +38,7 @@ % The hidden Markov models as generated in 2b or % downloaded from BioMet Toolbox (see below) % The final directory in dataDir should be styled as -% prok90_kegg105 or euk90_kegg105, indicating whether +% prok90_kegg116 or euk90_kegg116, indicating whether % the HMMs were trained on pro- or eukaryotic % sequences; using which sequence similarity treshold % (first set of digits); using which KEGG version @@ -99,12 +99,12 @@ % If -1 is provided, CD-HIT is skipped (optional, default 0.9) % globalModel structure containing both model and KOModel % structures as generated by getModelFromKEGG. These -% will otherwise be loaded by via getModelFromKEGG. +% will otherwise be loaded by via getModelFromKEGG. % Providing globalKEGGmodel can speed up model % generation if getKEGGModelForOrganism is run % multiple times for different strains. Example: % [globalModel.model,globalModel.KOModel] = getModelFromKEGG; -% (optional, default empty, global model is loaded by +% (optional, default empty, global model is loaded by % getModelFromKEGG) % % Output: @@ -259,36 +259,36 @@ else outDir=char(outDir); end -if nargin<5 +if nargin<5 || isempty(keepSpontaneous) keepSpontaneous=true; end -if nargin<6 +if nargin<6 || isempty(keepUndefinedStoich) keepUndefinedStoich=true; end -if nargin<7 +if nargin<7 || isempty(keepIncomplete) keepIncomplete=true; end -if nargin<8 +if nargin<8 || isempty(keepGeneral) keepGeneral=false; end -if nargin<9 +if nargin<9 || isempty(cutOff) cutOff=10^-50; end -if nargin<10 +if nargin<10 || isempty(minScoreRatioKO) minScoreRatioKO=0.3; end -if nargin<11 +if nargin<11 || isempty(minScoreRatioG) minScoreRatioG=0.8; end -if nargin<12 +if nargin<12 || isempty(maxPhylDist) maxPhylDist=inf; %Include all sequences for each reaction end -if nargin<13 +if nargin<13 || isempty(nSequences) nSequences=inf; %Include all sequences for each reaction end -if nargin<14 +if nargin<14 || isempty(seqIdentity) seqIdentity=0.9; end @@ -315,9 +315,9 @@ %required zip file already in working directory or have it extracted. If %the zip file and directory is not here, it is downloaded from the cloud if ~isempty(dataDir) - hmmOptions={'euk90_kegg105','prok90_kegg105'}; + hmmOptions={'euk90_kegg116','prok90_kegg116'}; if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions. - %If not, then check whether the required folders exist anyway. + %If not, then check whether the required folders exist anyway. if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ... ~isfolder(fullfile(dataDir,'fasta')) && ... ~isfolder(fullfile(dataDir,'aligned')) && ... @@ -339,14 +339,14 @@ else fprintf('Downloading the HMMs archive file... '); try - websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']); + websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.11.0/',hmmOptions{hmmIndex},'.zip']); catch ME if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError') error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues') end end end - + fprintf('COMPLETE\n'); fprintf('Extracting the HMMs archive file... '); unzip([dataDir,'.zip']); @@ -406,7 +406,7 @@ if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); end - + fprintf(['Pruning the model from non-' organismID ' genes... ']); if ismember(organismID,{'eukaryotes','prokaryotes'}) phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); @@ -552,6 +552,20 @@ return end +tmpFile=tempname; +%On Windows, paths need to be translated to Unix before parsing it to WSL +if ispc + wslPath.tmpFile=getWSLpath(tmpFile); + %mafft has problems writing to terminal (/dev/stderr) when running + %on WSL via MATLAB, instead write and read progress file + mafftOutput = tempname; + wslPath.mafftOutput=getWSLpath(mafftOutput); + wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); + wslPath.hmmbuild=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild')); + wslPath.hmmsearch=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch')); + wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); +end + %Check if alignment of FASTA files should be performed missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); if ~isempty(missingAligned) @@ -561,18 +575,7 @@ fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); end missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); - tmpFile=tempname; - %On Windows, paths need to be translated to Unix before parsing it to WSL - if ispc - wslPath.tmpFile=getWSLpath(tmpFile); - %mafft has problems writing to terminal (/dev/stderr) when running - %on WSL via MATLAB, instead write and read progress file - mafftOutput = tempname; - wslPath.mafftOutput=getWSLpath(mafftOutput); - wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); - wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); - end - + for i=1:numel(missingAligned) %This is checked here because it could be that it is created by a %parallel process. The faw-files are saved as temporary files to @@ -587,7 +590,7 @@ dispEM(EM,false); continue; end - + %If the multi-FASTA file is empty then save an empty aligned %file and continue s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); @@ -596,17 +599,17 @@ fclose(fid); continue; end - + %Create an empty file to prevent other threads to start to work %on the same alignment fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); fclose(fid); - + %First load the FASTA file, then select up to nSequences %sequences of the most closely related species, apply any %constraints from maxPhylDist, and save it as a temporary file, %and create the model from that - + fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); phylDist=inf(numel(fastaStruct),1); for j=1:numel(fastaStruct) @@ -620,24 +623,24 @@ end end end - + %Inf means that it should not be included phylDist(phylDist>maxPhylDist)=[]; - + %Sort based on phylDist [~, order]=sort(phylDist); - + %Save the first nSequences hits to a temporary FASTA file if nSequences<=numel(fastaStruct) fastaStruct=fastaStruct(order(1:nSequences)); else fastaStruct=fastaStruct(order); end - + %Do the clustering and alignment if there are more than one %sequences, otherwise just save the sequence (or an empty file) if numel(fastaStruct)>1 - if seqIdentity~=-1 + if seqIdentity~=-1 cdhitInpCustom=tempname; fastawrite(cdhitInpCustom,fastaStruct); if seqIdentity<=1 && seqIdentity>0.7 @@ -712,7 +715,7 @@ end %Move the temporary file to the real one movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); - + %Print the progress every 25 files if rem(i-1,25) == 0 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); @@ -750,7 +753,7 @@ dispEM(EM,false); continue; end - + %If the multi-FASTA file is empty then save an empty aligned %file and continue s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); @@ -763,14 +766,20 @@ %KO. This is because hmmbuild cannot overwrite existing files fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); fclose(fid); - + %Create HMM - [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); + if ismac || isunix + [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); + else + wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])); + wslPath.alignFile = getWSLpath(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); + [status, output] = system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.alignFile '"']); + end if status~=0 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; dispEM(EM); end - + %Delete the temporary file delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); @@ -805,30 +814,36 @@ dispEM(EM,false); continue; end - + %Save an empty file to prevent several threads working on the %same file fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); fclose(fid); - + %If the HMM file is empty then save an out file and continue s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); if s.bytes<=0 continue; end - + %Check each gene in the input file against this model - [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); + if ismac || isunix + [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); + else + wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); + wslPath.fastaFile = getWSLpath(fastaFile); + [status, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.fastaFile '"']); + end if status~=0 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; dispEM(EM); end - + %Save the output to a file fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); fwrite(fid,output); fclose(fid); - + %Print the progress every 25 files if rem(i-1,25) == 0 progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); @@ -861,16 +876,16 @@ while 1 %Get the next line tline = fgetl(fid); - + %Abort at end of file if ~ischar(tline) break; end - + if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) break; end - + if beginMatches==false %This is how the listing of matches begins if any(strfind(tline,'E-value ')) @@ -883,7 +898,7 @@ if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) elements=regexp(tline,' ','split'); elements=elements(cellfun(@any,elements)); - + %Check if the match is below the treshhold score=str2double(elements{1}); gene=elements{9}; @@ -952,7 +967,7 @@ %Find the KOs and the corresponding genes J=ismember(KOModel.rxns,KOs); [~, K]=find(koGeneMat(J,:)); - + if any(K) model.rxnGeneMat(i,K)=1; %Also delete KOs for which no genes were found. If no genes at diff --git a/external/kegg/getModelFromKEGG.m b/external/kegg/getModelFromKEGG.m index d9871a9f..9530ba72 100755 --- a/external/kegg/getModelFromKEGG.m +++ b/external/kegg/getModelFromKEGG.m @@ -44,8 +44,10 @@ % Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,... % keepUndefinedStoich,keepIncomplete,keepGeneral) +ravenPath=findRAVENroot(); + if nargin<1 - keggPath='RAVEN/external/kegg'; + keggPath=fullfile(ravenPath,'external','kegg'); else keggPath=char(keggPath); end @@ -62,7 +64,6 @@ keepGeneral=false; end -ravenPath=findRAVENroot(); modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat'); if exist(modelFile, 'file') && isNewestFile(ravenPath) fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']); diff --git a/external/kegg/getRxnsFromKEGG.m b/external/kegg/getRxnsFromKEGG.m index eab27d87..0ad96efc 100755 --- a/external/kegg/getRxnsFromKEGG.m +++ b/external/kegg/getRxnsFromKEGG.m @@ -68,15 +68,16 @@ % (except for '///') % +ravenPath=findRAVENroot(); + if nargin<1 - keggPath='RAVEN/external/kegg'; + keggPath=fullfile(ravenPath,'external','kegg'); else keggPath=char(keggPath); end %Check if the reactions have been parsed before and saved. If so, load the %model -ravenPath=findRAVENroot(); rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat'); if exist(rxnsFile, 'file') fprintf(['Importing KEGG reactions from ' strrep(rxnsFile,'\','/') '... ']); diff --git a/external/kegg/keggGenes.mat b/external/kegg/keggGenes.mat index 2456d77a..2494cbe9 100644 Binary files a/external/kegg/keggGenes.mat and b/external/kegg/keggGenes.mat differ diff --git a/external/kegg/keggMets.mat b/external/kegg/keggMets.mat index bbe4a90d..784901c0 100644 Binary files a/external/kegg/keggMets.mat and b/external/kegg/keggMets.mat differ diff --git a/external/kegg/keggPhylDist.mat b/external/kegg/keggPhylDist.mat index 9cdde3b9..0f05bf94 100644 Binary files a/external/kegg/keggPhylDist.mat and b/external/kegg/keggPhylDist.mat differ diff --git a/external/kegg/keggRxns.mat b/external/kegg/keggRxns.mat index 78802a41..b9993055 100644 Binary files a/external/kegg/keggRxns.mat and b/external/kegg/keggRxns.mat differ diff --git a/external/metacyc/metaCycEnzymes.mat b/external/metacyc/metaCycEnzymes.mat index 1caf660f..7b03db05 100644 Binary files a/external/metacyc/metaCycEnzymes.mat and b/external/metacyc/metaCycEnzymes.mat differ diff --git a/external/metacyc/metaCycMets.mat b/external/metacyc/metaCycMets.mat index cb86ac0c..26707d6a 100644 Binary files a/external/metacyc/metaCycMets.mat and b/external/metacyc/metaCycMets.mat differ diff --git a/external/metacyc/metaCycRxns.mat b/external/metacyc/metaCycRxns.mat index 97a069a1..e5c680ce 100644 Binary files a/external/metacyc/metaCycRxns.mat and b/external/metacyc/metaCycRxns.mat differ diff --git a/io/exportModel.m b/io/exportModel.m index ca969c17..27c9bb6f 100755 --- a/io/exportModel.m +++ b/io/exportModel.m @@ -568,7 +568,7 @@ function exportModel(model,fileName,neverPrefix,supressWarnings,sortIds) model.subSystems = cellfun(@(c) cellfun(@char, c, 'UniformOutput', false), model.subSystems, 'UniformOutput', false); % === 2) Flatten once: names and their reaction indices (vectorized) === - flatNames = [model.subSystems{:}]; % 1×M cellstr of all subsystem labels + flatNames = vertcat(model.subSystems{:}); % 1×M cellstr of all subsystem labels if isempty(flatNames) % Nothing to do: no subsystems present return diff --git a/software/blast+/blastp b/software/blast+/blastp index 807f90c2..5b2e6e8e 100755 Binary files a/software/blast+/blastp and b/software/blast+/blastp differ diff --git a/software/blast+/blastp.exe b/software/blast+/blastp.exe index 3927fb36..fadea7fd 100644 Binary files a/software/blast+/blastp.exe and b/software/blast+/blastp.exe differ diff --git a/software/blast+/blastp.mac b/software/blast+/blastp.mac index c94f53aa..9f1e81cc 100755 Binary files a/software/blast+/blastp.mac and b/software/blast+/blastp.mac differ diff --git a/software/blast+/makeblastdb b/software/blast+/makeblastdb index d191a43d..a4e4d9fd 100755 Binary files a/software/blast+/makeblastdb and b/software/blast+/makeblastdb differ diff --git a/software/blast+/makeblastdb.exe b/software/blast+/makeblastdb.exe index 331a2bfb..7866a977 100644 Binary files a/software/blast+/makeblastdb.exe and b/software/blast+/makeblastdb.exe differ diff --git a/software/blast+/makeblastdb.mac b/software/blast+/makeblastdb.mac index 578acabf..c632a8ef 100755 Binary files a/software/blast+/makeblastdb.mac and b/software/blast+/makeblastdb.mac differ diff --git a/software/blast+/nghttp2.dll b/software/blast+/nghttp2.dll index 850efeb8..cb71384f 100644 Binary files a/software/blast+/nghttp2.dll and b/software/blast+/nghttp2.dll differ diff --git a/software/diamond/diamond b/software/diamond/diamond index 546b8e06..db7effdb 100755 Binary files a/software/diamond/diamond and b/software/diamond/diamond differ diff --git a/software/diamond/diamond.exe b/software/diamond/diamond.exe index ad36d58b..df530e04 100644 Binary files a/software/diamond/diamond.exe and b/software/diamond/diamond.exe differ diff --git a/software/diamond/diamond.mac b/software/diamond/diamond.mac index b5e1c833..508d0fd9 100755 Binary files a/software/diamond/diamond.mac and b/software/diamond/diamond.mac differ diff --git a/software/hmmer/cygwin1.dll b/software/hmmer/cygwin1.dll deleted file mode 100644 index 350a74ba..00000000 Binary files a/software/hmmer/cygwin1.dll and /dev/null differ diff --git a/software/hmmer/hmmbuild b/software/hmmer/hmmbuild index a6f81a95..e759997c 100755 Binary files a/software/hmmer/hmmbuild and b/software/hmmer/hmmbuild differ diff --git a/software/hmmer/hmmbuild.exe b/software/hmmer/hmmbuild.exe deleted file mode 100644 index 950a8394..00000000 Binary files a/software/hmmer/hmmbuild.exe and /dev/null differ diff --git a/software/hmmer/hmmbuild.mac b/software/hmmer/hmmbuild.mac index 1f457757..57fe27e7 100755 Binary files a/software/hmmer/hmmbuild.mac and b/software/hmmer/hmmbuild.mac differ diff --git a/software/hmmer/hmmsearch b/software/hmmer/hmmsearch index ab090fdc..34bc2aca 100755 Binary files a/software/hmmer/hmmsearch and b/software/hmmer/hmmsearch differ diff --git a/software/hmmer/hmmsearch.exe b/software/hmmer/hmmsearch.exe deleted file mode 100644 index 66856d57..00000000 Binary files a/software/hmmer/hmmsearch.exe and /dev/null differ diff --git a/software/hmmer/hmmsearch.mac b/software/hmmer/hmmsearch.mac index 0f393cfd..342f6d26 100755 Binary files a/software/hmmer/hmmsearch.mac and b/software/hmmer/hmmsearch.mac differ diff --git a/software/m2html/private/m2htmltoolbarimages.mat b/software/m2html/private/m2htmltoolbarimages.mat index 2fd358ee..19e9f16b 100644 Binary files a/software/m2html/private/m2htmltoolbarimages.mat and b/software/m2html/private/m2htmltoolbarimages.mat differ diff --git a/software/mafft/mafft-linux64/mafftdir/bin/mafft b/software/mafft/mafft-linux64/mafftdir/bin/mafft index cc0af8e6..defcc4e9 100755 --- a/software/mafft/mafft-linux64/mafftdir/bin/mafft +++ b/software/mafft/mafft-linux64/mafftdir/bin/mafft @@ -1,7 +1,7 @@ #! /bin/bash er=0; myself=`dirname "$0"`/`basename "$0"`; export myself -version="v7.490 (2021/Oct/30)"; export version +version="v7.526 (2024/Apr/26)"; export version LANG=C; export LANG os=`uname` progname=`basename "$0"` @@ -175,6 +175,7 @@ elif [ $progname = "nwnsi" -o $progname = "mafft-nwnsi" ]; then fi outputfile="" namelength=-1 +linelength=60 # Will change to -1 in the future anysymbol=0 parallelizationstrategy="BAATARI2" kappa=$defaultkappa @@ -216,7 +217,9 @@ contrafold=$defaultcontrafold progressfile="/dev/stderr" anchorfile="/dev/null" anchoropt="" -maxanchorseparation=1000 +#maxanchorseparation=1000 +#maxanchorseparation=-1 # 2023/Jan/11 +terminalmargin=100 # 2024/Mar debug=0 sw=0 algopt=$defaultalgopt @@ -241,6 +244,7 @@ partorderopt=" -x " treeout=0 nodeout=0 distout=0 +distformat="hat2" treein=0 topin=0 treeinopt=" " @@ -252,12 +256,15 @@ strdir="$PWD" scorematrix="/dev/null" textmatrix="/dev/null" treeinfile="/dev/null" +codonposfile="/dev/null" +codonscorefile="/dev/null" rnascoremtx=" " laraparams="/dev/null" foldalignopt=" " treealg=" -X 0.1 " sueff="1.0" maxambiguous="1.0" +dofilter=0 scoreoutarg=" " numthreads=0 numthreadsit=-1 @@ -352,6 +359,13 @@ if [ $# -gt 0 ]; then echo "" 1>&2 exit 1; # algspecified=1 + elif [ "$1" = "--linelength" ]; then + shift + linelength=`expr "$1" - 0` + if [ $linelength -eq 0 ]; then + echo "Line length = 0 ?" 1>&2 + exit + fi elif [ "$1" = "--namelength" ]; then shift namelength=`expr "$1" - 0` @@ -389,6 +403,10 @@ if [ $# -gt 0 ]; then treeout=1 elif [ "$1" = "--distout" ]; then distout=1 + distformat="hat2" + elif [ "$1" = "--distoutclustalo" ]; then + distout=1 + distformat="clodist" elif [ "$1" = "--fastswpair" ]; then distance="fasta" pairspecified=1 @@ -410,6 +428,25 @@ if [ $# -gt 0 ]; then elif [ "$1" = "--maxambiguous" ]; then shift maxambiguous="$1" + dofilter=1 + elif [ "$1" = "--codonpos" ]; then + shift + codonposfile="$1" + if [ ! -e "$codonposfile" ]; then + echo "Cannot open $codonposfile" 1>&2 + echo "" 1>&2 + exit + fi + codonposopt=" -R " + elif [ "$1" = "--codonscore" ]; then + shift + codonscorefile="$1" + if [ ! -e "$codonscorefile" ]; then + echo "Cannot open $codonscorefile" 1>&2 + echo "" 1>&2 + exit + fi + codonscoreopt=" -S " elif [ "$1" = "--noscore" ]; then scorecalcopt=" -Z " elif [ "$1" = "--6mermultipair" ]; then @@ -544,13 +581,13 @@ if [ $# -gt 0 ]; then elif [ "$1" = "--out" ]; then shift outputfile="$1" - elif [ "$1" = "--skipanchorsremoterthan" ]; then + elif [ "$1" = "--terminalmargin" ]; then shift if ! expr "$1" : "[0-9]" > /dev/null ; then - echo "Specify maximum gap length between anchors." 1>&2 + echo "set --terminalmargin (int)." 1>&2 exit fi - maxanchorseparation=`expr "$1" - 0` + terminalmargin=`expr "$1" - 0` elif [ "$1" = "--anchors" ]; then shift anchorfile="$1" @@ -645,10 +682,26 @@ if [ $# -gt 0 ]; then addarg0="-K -I" addfile="$1" fragment=-2 + elif [ "$1" = "--addtotop" ]; then + shift + addarg0="-K -I" + addfile="$1" + fragment=-3 + elif [ "$1" = "--addtoroot" ]; then + shift + addarg0="-K -I" + addfile="$1" + fragment=-4 elif [ "$1" = "--smoothing" ]; then add2ndhalfarg=$add2ndhalfarg" -p " elif [ "$1" = "--keeplength" ]; then add2ndhalfarg=$add2ndhalfarg" -Y " + elif [ "$1" = "--compactmapout" ]; then + add2ndhalfarg=$add2ndhalfarg" -z -Y " + elif [ "$1" = "--compactmapoutfile" ]; then + shift + add2ndhalfarg=$add2ndhalfarg" -z -Y " + mapoutfile="$1" elif [ "$1" = "--mapout" ]; then add2ndhalfarg=$add2ndhalfarg" -Z -Y " elif [ "$1" = "--mapoutfile" ]; then @@ -684,7 +737,8 @@ if [ $# -gt 0 ]; then f2clext="-E" seqtype="-P" fft=0 - sbstmodel=" -b -2 -a " +# sbstmodel=" -b -2 -a " + sbstmodel=" -b -2 " # 2022/Jul, hauretsu no kawari ni scoremtx wo miru scorematrix="$1" if [ ! -e "$scorematrix" ]; then echo "Cannot open $scorematrix" 1>&2 @@ -1079,9 +1133,9 @@ function removetmpfile() { # for MPI echo "" >> "$TMPFILE/infile" cat "$addfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_addfile" - if [ $maxambiguous != "1.0" ]; then - mv "$TMPFILE/infile" "$TMPFILE/_tofilter" - "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/infile" 2>>"$progressfile" || exit 1 + if [ $dofilter -eq 1 ]; then +# mv "$TMPFILE/infile" "$TMPFILE/_tofilter" +# "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/infile" 2>>"$progressfile" || exit 1 mv "$TMPFILE/_addfile" "$TMPFILE/_tofilter" "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/_addfile" 2>>"$progressfile" || exit 1 fi @@ -1090,6 +1144,8 @@ function removetmpfile() { # for MPI cat "$scorematrix" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_aamtx" cat "$mergetable" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_subalignmentstable" cat "$treeinfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_guidetree" + cat "$codonposfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_codonpos" + cat "$codonscorefile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_codonscore" cat "$seedtablefile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_seedtablefile" cat "$laraparams" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_lara.params" cat "$pdblist" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/pdblist" @@ -1104,6 +1160,8 @@ $addfile $scorematrix $mergetable $treeinfile +$codonposfile +$codonscorefile $seedtablefile $laraparams $pdblist @@ -1335,10 +1393,11 @@ $ownlist" exit 1; fi # nagasa check! -# - if [ $npair -gt 10000000 -o $nlen -gt 5000 ]; then # 2017/Oct +# if [ $npair -gt 10000000 -o $nlen -gt 5000 ]; then # 2017/Oct + if [ $npair -gt 10000000 -o $nlen -gt 5000 -o $nadd -gt 500000 ]; then # 2021/Dec pairlocalalign to buntan distance="ktuples" echo "use ktuples, size=$tuplesize!" 1>>"$progressfile" +# elif [ $npair -gt 3000000 -o $nlen -gt 5000 ]; then # 2017/Oct elif [ $npair -gt 3000000 -o $nlen -gt 5000 ]; then # 2017/Oct distance="multi" weighti="0.0" @@ -1739,7 +1798,7 @@ $ownlist" parttreeoutopt=" " fi if [ $distout -eq 1 ]; then - distoutopt="-y -T" + distoutopt="-y ${distformat} -T" if [ $treeout -eq 0 ]; then treeoutopt="" fi @@ -1760,7 +1819,7 @@ $ownlist" treeoutopt=" " fi if [ $distout -eq 1 ]; then - distoutopt="-y" + distoutopt="-y ${distformat}" fi fi # @@ -2000,6 +2059,16 @@ $ownlist" laof=0.0 # 2015Jun01 lexp=0.0 # 2015Jun01 iterate=0 + elif [ $fragment -eq "-3" ]; then + addarg="$addarg0 $nadd" + addsinglearg="-x" # add to top, 2021/12/31 + cycle=1 # chuui 2014Aug25 + iterate=0 + elif [ $fragment -eq "-4" ]; then + addarg="$addarg0 $nadd" + addsinglearg="-s" # add to root, 2023/2/11 + cycle=1 # chuui 2014Aug25 + iterate=0 else addarg="$addarg0 $nadd" addsinglearg="" @@ -2032,6 +2101,21 @@ $ownlist" fi fi + if [ "$codonposfile" != "/dev/null" -o "$codonscorefile" != "/dev/null" ]; then + if [ $nadd -eq "0" -o $fragment -eq "0" ]; then + echo '' 1>>"$progressfile" + echo "'--codonpos' and '--codonscore' options are supported only with the '--6merpair --addfragments' option." 1>>"$progressfile" + echo '' 1>>"$progressfile" + exit 1 + fi + if [ $distance != "ktuples" ]; then # ato de taiou + echo '' 1>>"$progressfile" + echo "'--codonpos' and '--codonscore' options are supported only with the '--6merpair --addfragments' option, at this point." 1>>"$progressfile" + echo '' 1>>"$progressfile" + exit 1 + fi + fi + if [ -z "$localparam" -a $fragment -eq 0 -a $distance != "parttree" ]; then # echo "use disttbfast" @@ -2143,6 +2227,8 @@ $ownlist" strategy=$strategy"full" elif [ $fragment -eq -2 ]; then strategy=$strategy"long" + elif [ $fragment -eq -3 ]; then + strategy=$strategy"top" else strategy=$strategy$cycle fi @@ -2194,9 +2280,9 @@ $ownlist" elif [ $outputformat = "phylip" -a $outorder = "input" ]; then outputopt=" -y " elif [ $outputformat = "pir" -a $outorder = "aligned" ]; then - outputopt=" -f -r $TMPFILE/order " + outputopt=" -f -l $linelength -r $TMPFILE/order " else - outputopt="-f" + outputopt="-f -l $linelength" fi if [ $newdash_originalsequenceonly -eq 1 ]; then @@ -2502,7 +2588,7 @@ $ownlist" "$prefix/pairlocalalign" $localparam $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -L $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hatx hat3 - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/tbfast" _ -u $unalignlevel $localparam -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -L $usenaivepairscore $focusarg _ -+ $iterate -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg $focusarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2518,7 +2604,7 @@ $ownlist" "$prefix/pairlocalalign" $addarg -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -R $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hatx hat3 - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/pairlocalalign" -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -R $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 # addarg wo watasanai @@ -2527,27 +2613,27 @@ $ownlist" "$prefix/tbfast" -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi elif [ $distance = "lastmulti" ]; then - "$prefix/dndpre" $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 mv hat2 hat2i "$prefix/pairlocalalign" $addarg -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -r $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hat2 hat2n mv hatx hat3 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 fi elif [ $distance = "multi" ]; then - "$prefix/dndpre" $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 mv hat2 hat2i "$prefix/pairlocalalign" $localparam $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -Y $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hat2 hat2n mv hatx hat3 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 @@ -2558,7 +2644,7 @@ $ownlist" mv hatx hat3 "$prefix/disttbfast" -E 1 -s $unalignlevel $legacygapopt -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt -T -y $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/tbfast" -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2569,22 +2655,22 @@ $ownlist" "$prefix/splittbfast" $legacygapopt $algopt $splitopt $partorderopt $parttreeoutopt $memopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof -p $partsize -s $groupsize $treealg $outnum -i infile > pre 2>>"$progressfile" || exit 1 mv hat3.seed hat3 elif [ $distance = "ktuplesmulti" ]; then -# "$prefix/dndpre" $model -M 1 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 +# "$prefix/dndpre" -y $distformat $model -M 1 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 # mv hat2 hat2i # "$prefix/disttbfast" -E 1 -s $unalignlevel $legacygapopt -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt -T -y $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 # mv hat2 hat2n if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 -# "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 +# "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 fi else if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else - "$prefix/disttbfast" -q $npickup -E $cycledisttbfast -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -g $gexp -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg $anchoropt -x $maxanchorseparation $oneiterationopt < infile > pre 2>>"$progressfile" || exit 1 + "$prefix/disttbfast" -q $npickup -E $cycledisttbfast -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -g $gexp -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg $anchoropt -x $terminalmargin $oneiterationopt < infile > pre 2>>"$progressfile" || exit 1 mv hat3.seed hat3 fi fi @@ -2602,7 +2688,7 @@ $ownlist" done if [ $iterate -gt 0 ]; then if [ $distance = "ktuples" ]; then - "$prefix/dndpre" $seqtype $model -M 2 -C $numthreads < pre > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $seqtype $model -M 2 -C $numthreads < pre > /dev/null 2>>"$progressfile" || exit 1 fi "$prefix/dvtditr" -W $minimumweight $bunkatsuopt -E $fixthreshold -s $unalignlevel $legacygapopt $mergearg $outnum -C $numthreadsit -t $randomseed $rnaoptit $memopt $scorecalcopt $localparam -z 50 $seqtype $model -f "-"$gop -Q $spfactor -h $aof -I $iterate $weightopt $treeinopt $algoptit $treealg -p $parallelizationstrategy $scoreoutarg -K $nadd < pre > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2711,7 +2797,7 @@ $ownlist" popd > /dev/null - if [ "$outputopt" != "-f" -o "$windows" = "yes" ]; then # Windows deha kaigyo code wo f2cl de modosu. + if [ "$outputopt" != "-f -l -1" -o "$windows" = "yes" ]; then # Windows deha kaigyo code wo f2cl de modosu. # ln -s "$TMPFILE/order" _order$$ # f2cl ga space ari filename ni taiou shiteinainode # cp "$TMPFILE/order" _order$$ # ln -s no error wo sakeru if [ "$outputfile" = "" ]; then @@ -2737,7 +2823,7 @@ $ownlist" fi if [ $distout -eq 1 ]; then - cp "$TMPFILE/hat2" "$infilename.hat2" + cp "$TMPFILE/${distformat}" "$infilename.${distformat}" fi if [ $npickup -ne 0 ]; then diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/addsingle b/software/mafft/mafft-linux64/mafftdir/libexec/addsingle index 11a61d6d..5aa4e300 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/addsingle and b/software/mafft/mafft-linux64/mafftdir/libexec/addsingle differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/contrafoldwrap b/software/mafft/mafft-linux64/mafftdir/libexec/contrafoldwrap index 0e1beac9..b67965f0 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/contrafoldwrap and b/software/mafft/mafft-linux64/mafftdir/libexec/contrafoldwrap differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/countlen b/software/mafft/mafft-linux64/mafftdir/libexec/countlen index 0efbbab1..829d67b5 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/countlen and b/software/mafft/mafft-linux64/mafftdir/libexec/countlen differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/disttbfast b/software/mafft/mafft-linux64/mafftdir/libexec/disttbfast index 4d950cf7..0a0699dd 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/disttbfast and b/software/mafft/mafft-linux64/mafftdir/libexec/disttbfast differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/dndblast b/software/mafft/mafft-linux64/mafftdir/libexec/dndblast index b4b112d0..636ea9fb 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/dndblast and b/software/mafft/mafft-linux64/mafftdir/libexec/dndblast differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/dndfast7 b/software/mafft/mafft-linux64/mafftdir/libexec/dndfast7 index 0ee51697..3a064967 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/dndfast7 and b/software/mafft/mafft-linux64/mafftdir/libexec/dndfast7 differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/dndpre b/software/mafft/mafft-linux64/mafftdir/libexec/dndpre index 4b27abed..49c249e1 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/dndpre and b/software/mafft/mafft-linux64/mafftdir/libexec/dndpre differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/dvtditr b/software/mafft/mafft-linux64/mafftdir/libexec/dvtditr index 115b2ec7..ebb46dd7 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/dvtditr and b/software/mafft/mafft-linux64/mafftdir/libexec/dvtditr differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/f2cl b/software/mafft/mafft-linux64/mafftdir/libexec/f2cl index d740559a..fd685023 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/f2cl and b/software/mafft/mafft-linux64/mafftdir/libexec/f2cl differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/filter b/software/mafft/mafft-linux64/mafftdir/libexec/filter index f0ad2ead..35d6e241 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/filter and b/software/mafft/mafft-linux64/mafftdir/libexec/filter differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/getlag b/software/mafft/mafft-linux64/mafftdir/libexec/getlag index 7c4ea918..6c57fff0 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/getlag and b/software/mafft/mafft-linux64/mafftdir/libexec/getlag differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/mafft-distance b/software/mafft/mafft-linux64/mafftdir/libexec/mafft-distance index be7f77a2..cd02da5b 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/mafft-distance and b/software/mafft/mafft-linux64/mafftdir/libexec/mafft-distance differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/mafft-profile b/software/mafft/mafft-linux64/mafftdir/libexec/mafft-profile index 73936551..f4bf0db1 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/mafft-profile and b/software/mafft/mafft-linux64/mafftdir/libexec/mafft-profile differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/makedirectionlist b/software/mafft/mafft-linux64/mafftdir/libexec/makedirectionlist index 89f3ccb3..1f1cae6f 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/makedirectionlist and b/software/mafft/mafft-linux64/mafftdir/libexec/makedirectionlist differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/mccaskillwrap b/software/mafft/mafft-linux64/mafftdir/libexec/mccaskillwrap index 85bd1665..7e8d4613 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/mccaskillwrap and b/software/mafft/mafft-linux64/mafftdir/libexec/mccaskillwrap differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/multi2hat3s b/software/mafft/mafft-linux64/mafftdir/libexec/multi2hat3s index 4b06fcf4..a7154fef 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/multi2hat3s and b/software/mafft/mafft-linux64/mafftdir/libexec/multi2hat3s differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/nodepair b/software/mafft/mafft-linux64/mafftdir/libexec/nodepair index 581fefea..fba0428c 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/nodepair and b/software/mafft/mafft-linux64/mafftdir/libexec/nodepair differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/pairash b/software/mafft/mafft-linux64/mafftdir/libexec/pairash index 307a0534..ed40c2fc 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/pairash and b/software/mafft/mafft-linux64/mafftdir/libexec/pairash differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/pairlocalalign b/software/mafft/mafft-linux64/mafftdir/libexec/pairlocalalign index 475cba76..6858df36 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/pairlocalalign and b/software/mafft/mafft-linux64/mafftdir/libexec/pairlocalalign differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/regtable2seq b/software/mafft/mafft-linux64/mafftdir/libexec/regtable2seq index 45e90d17..9c364cf4 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/regtable2seq and b/software/mafft/mafft-linux64/mafftdir/libexec/regtable2seq differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/replaceu b/software/mafft/mafft-linux64/mafftdir/libexec/replaceu index fcd6ab7e..97a933eb 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/replaceu and b/software/mafft/mafft-linux64/mafftdir/libexec/replaceu differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/restoreu b/software/mafft/mafft-linux64/mafftdir/libexec/restoreu index ff6bbb5f..6144e53a 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/restoreu and b/software/mafft/mafft-linux64/mafftdir/libexec/restoreu differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/score b/software/mafft/mafft-linux64/mafftdir/libexec/score index c815992f..1f270504 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/score and b/software/mafft/mafft-linux64/mafftdir/libexec/score differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/seq2regtable b/software/mafft/mafft-linux64/mafftdir/libexec/seq2regtable index 4b2b440e..990929e1 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/seq2regtable and b/software/mafft/mafft-linux64/mafftdir/libexec/seq2regtable differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/setcore b/software/mafft/mafft-linux64/mafftdir/libexec/setcore index 30472e54..4c886d20 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/setcore and b/software/mafft/mafft-linux64/mafftdir/libexec/setcore differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/setdirection b/software/mafft/mafft-linux64/mafftdir/libexec/setdirection index 95b15080..b9981ff0 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/setdirection and b/software/mafft/mafft-linux64/mafftdir/libexec/setdirection differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/sextet5 b/software/mafft/mafft-linux64/mafftdir/libexec/sextet5 index 0df7bbc5..e9fc9242 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/sextet5 and b/software/mafft/mafft-linux64/mafftdir/libexec/sextet5 differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/splittbfast b/software/mafft/mafft-linux64/mafftdir/libexec/splittbfast index c35e2881..f49ea46e 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/splittbfast and b/software/mafft/mafft-linux64/mafftdir/libexec/splittbfast differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/tbfast b/software/mafft/mafft-linux64/mafftdir/libexec/tbfast index 1ab0e605..090e35db 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/tbfast and b/software/mafft/mafft-linux64/mafftdir/libexec/tbfast differ diff --git a/software/mafft/mafft-linux64/mafftdir/libexec/version b/software/mafft/mafft-linux64/mafftdir/libexec/version index e30b3a86..91b987cb 100755 Binary files a/software/mafft/mafft-linux64/mafftdir/libexec/version and b/software/mafft/mafft-linux64/mafftdir/libexec/version differ diff --git a/software/mafft/mafft-mac/mafftdir/bin/mafft b/software/mafft/mafft-mac/mafftdir/bin/mafft index 91b5fc97..ea696b6b 100755 --- a/software/mafft/mafft-mac/mafftdir/bin/mafft +++ b/software/mafft/mafft-mac/mafftdir/bin/mafft @@ -1,7 +1,7 @@ #! /bin/bash er=0; myself=`dirname "$0"`/`basename "$0"`; export myself -version="v7.490 (2021/Oct/30)"; export version +version="v7.526 (2024/Apr/26)"; export version LANG=C; export LANG os=`uname` progname=`basename "$0"` @@ -175,6 +175,7 @@ elif [ $progname = "nwnsi" -o $progname = "mafft-nwnsi" ]; then fi outputfile="" namelength=-1 +linelength=60 # Will change to -1 in the future anysymbol=0 parallelizationstrategy="BAATARI2" kappa=$defaultkappa @@ -216,7 +217,9 @@ contrafold=$defaultcontrafold progressfile="/dev/stderr" anchorfile="/dev/null" anchoropt="" -maxanchorseparation=1000 +#maxanchorseparation=1000 +#maxanchorseparation=-1 # 2023/Jan/11 +terminalmargin=100 # 2024/Mar debug=0 sw=0 algopt=$defaultalgopt @@ -241,6 +244,7 @@ partorderopt=" -x " treeout=0 nodeout=0 distout=0 +distformat="hat2" treein=0 topin=0 treeinopt=" " @@ -252,12 +256,15 @@ strdir="$PWD" scorematrix="/dev/null" textmatrix="/dev/null" treeinfile="/dev/null" +codonposfile="/dev/null" +codonscorefile="/dev/null" rnascoremtx=" " laraparams="/dev/null" foldalignopt=" " treealg=" -X 0.1 " sueff="1.0" maxambiguous="1.0" +dofilter=0 scoreoutarg=" " numthreads=0 numthreadsit=-1 @@ -352,6 +359,13 @@ if [ $# -gt 0 ]; then echo "" 1>&2 exit 1; # algspecified=1 + elif [ "$1" = "--linelength" ]; then + shift + linelength=`expr "$1" - 0` + if [ $linelength -eq 0 ]; then + echo "Line length = 0 ?" 1>&2 + exit + fi elif [ "$1" = "--namelength" ]; then shift namelength=`expr "$1" - 0` @@ -389,6 +403,10 @@ if [ $# -gt 0 ]; then treeout=1 elif [ "$1" = "--distout" ]; then distout=1 + distformat="hat2" + elif [ "$1" = "--distoutclustalo" ]; then + distout=1 + distformat="clodist" elif [ "$1" = "--fastswpair" ]; then distance="fasta" pairspecified=1 @@ -410,6 +428,25 @@ if [ $# -gt 0 ]; then elif [ "$1" = "--maxambiguous" ]; then shift maxambiguous="$1" + dofilter=1 + elif [ "$1" = "--codonpos" ]; then + shift + codonposfile="$1" + if [ ! -e "$codonposfile" ]; then + echo "Cannot open $codonposfile" 1>&2 + echo "" 1>&2 + exit + fi + codonposopt=" -R " + elif [ "$1" = "--codonscore" ]; then + shift + codonscorefile="$1" + if [ ! -e "$codonscorefile" ]; then + echo "Cannot open $codonscorefile" 1>&2 + echo "" 1>&2 + exit + fi + codonscoreopt=" -S " elif [ "$1" = "--noscore" ]; then scorecalcopt=" -Z " elif [ "$1" = "--6mermultipair" ]; then @@ -544,13 +581,13 @@ if [ $# -gt 0 ]; then elif [ "$1" = "--out" ]; then shift outputfile="$1" - elif [ "$1" = "--skipanchorsremoterthan" ]; then + elif [ "$1" = "--terminalmargin" ]; then shift if ! expr "$1" : "[0-9]" > /dev/null ; then - echo "Specify maximum gap length between anchors." 1>&2 + echo "set --terminalmargin (int)." 1>&2 exit fi - maxanchorseparation=`expr "$1" - 0` + terminalmargin=`expr "$1" - 0` elif [ "$1" = "--anchors" ]; then shift anchorfile="$1" @@ -645,10 +682,26 @@ if [ $# -gt 0 ]; then addarg0="-K -I" addfile="$1" fragment=-2 + elif [ "$1" = "--addtotop" ]; then + shift + addarg0="-K -I" + addfile="$1" + fragment=-3 + elif [ "$1" = "--addtoroot" ]; then + shift + addarg0="-K -I" + addfile="$1" + fragment=-4 elif [ "$1" = "--smoothing" ]; then add2ndhalfarg=$add2ndhalfarg" -p " elif [ "$1" = "--keeplength" ]; then add2ndhalfarg=$add2ndhalfarg" -Y " + elif [ "$1" = "--compactmapout" ]; then + add2ndhalfarg=$add2ndhalfarg" -z -Y " + elif [ "$1" = "--compactmapoutfile" ]; then + shift + add2ndhalfarg=$add2ndhalfarg" -z -Y " + mapoutfile="$1" elif [ "$1" = "--mapout" ]; then add2ndhalfarg=$add2ndhalfarg" -Z -Y " elif [ "$1" = "--mapoutfile" ]; then @@ -684,7 +737,8 @@ if [ $# -gt 0 ]; then f2clext="-E" seqtype="-P" fft=0 - sbstmodel=" -b -2 -a " +# sbstmodel=" -b -2 -a " + sbstmodel=" -b -2 " # 2022/Jul, hauretsu no kawari ni scoremtx wo miru scorematrix="$1" if [ ! -e "$scorematrix" ]; then echo "Cannot open $scorematrix" 1>&2 @@ -1079,9 +1133,9 @@ function removetmpfile() { # for MPI echo "" >> "$TMPFILE/infile" cat "$addfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_addfile" - if [ $maxambiguous != "1.0" ]; then - mv "$TMPFILE/infile" "$TMPFILE/_tofilter" - "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/infile" 2>>"$progressfile" || exit 1 + if [ $dofilter -eq 1 ]; then +# mv "$TMPFILE/infile" "$TMPFILE/_tofilter" +# "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/infile" 2>>"$progressfile" || exit 1 mv "$TMPFILE/_addfile" "$TMPFILE/_tofilter" "$prefix/filter" -m $maxambiguous $seqtype -i "$TMPFILE/_tofilter" > "$TMPFILE/_addfile" 2>>"$progressfile" || exit 1 fi @@ -1090,6 +1144,8 @@ function removetmpfile() { # for MPI cat "$scorematrix" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_aamtx" cat "$mergetable" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_subalignmentstable" cat "$treeinfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_guidetree" + cat "$codonposfile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_codonpos" + cat "$codonscorefile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_codonscore" cat "$seedtablefile" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_seedtablefile" cat "$laraparams" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/_lara.params" cat "$pdblist" | tr "\r" "\n" | grep -v "^$" > "$TMPFILE/pdblist" @@ -1104,6 +1160,8 @@ $addfile $scorematrix $mergetable $treeinfile +$codonposfile +$codonscorefile $seedtablefile $laraparams $pdblist @@ -1335,10 +1393,11 @@ $ownlist" exit 1; fi # nagasa check! -# - if [ $npair -gt 10000000 -o $nlen -gt 5000 ]; then # 2017/Oct +# if [ $npair -gt 10000000 -o $nlen -gt 5000 ]; then # 2017/Oct + if [ $npair -gt 10000000 -o $nlen -gt 5000 -o $nadd -gt 500000 ]; then # 2021/Dec pairlocalalign to buntan distance="ktuples" echo "use ktuples, size=$tuplesize!" 1>>"$progressfile" +# elif [ $npair -gt 3000000 -o $nlen -gt 5000 ]; then # 2017/Oct elif [ $npair -gt 3000000 -o $nlen -gt 5000 ]; then # 2017/Oct distance="multi" weighti="0.0" @@ -1739,7 +1798,7 @@ $ownlist" parttreeoutopt=" " fi if [ $distout -eq 1 ]; then - distoutopt="-y -T" + distoutopt="-y ${distformat} -T" if [ $treeout -eq 0 ]; then treeoutopt="" fi @@ -1760,7 +1819,7 @@ $ownlist" treeoutopt=" " fi if [ $distout -eq 1 ]; then - distoutopt="-y" + distoutopt="-y ${distformat}" fi fi # @@ -2000,6 +2059,16 @@ $ownlist" laof=0.0 # 2015Jun01 lexp=0.0 # 2015Jun01 iterate=0 + elif [ $fragment -eq "-3" ]; then + addarg="$addarg0 $nadd" + addsinglearg="-x" # add to top, 2021/12/31 + cycle=1 # chuui 2014Aug25 + iterate=0 + elif [ $fragment -eq "-4" ]; then + addarg="$addarg0 $nadd" + addsinglearg="-s" # add to root, 2023/2/11 + cycle=1 # chuui 2014Aug25 + iterate=0 else addarg="$addarg0 $nadd" addsinglearg="" @@ -2032,6 +2101,21 @@ $ownlist" fi fi + if [ "$codonposfile" != "/dev/null" -o "$codonscorefile" != "/dev/null" ]; then + if [ $nadd -eq "0" -o $fragment -eq "0" ]; then + echo '' 1>>"$progressfile" + echo "'--codonpos' and '--codonscore' options are supported only with the '--6merpair --addfragments' option." 1>>"$progressfile" + echo '' 1>>"$progressfile" + exit 1 + fi + if [ $distance != "ktuples" ]; then # ato de taiou + echo '' 1>>"$progressfile" + echo "'--codonpos' and '--codonscore' options are supported only with the '--6merpair --addfragments' option, at this point." 1>>"$progressfile" + echo '' 1>>"$progressfile" + exit 1 + fi + fi + if [ -z "$localparam" -a $fragment -eq 0 -a $distance != "parttree" ]; then # echo "use disttbfast" @@ -2143,6 +2227,8 @@ $ownlist" strategy=$strategy"full" elif [ $fragment -eq -2 ]; then strategy=$strategy"long" + elif [ $fragment -eq -3 ]; then + strategy=$strategy"top" else strategy=$strategy$cycle fi @@ -2194,9 +2280,9 @@ $ownlist" elif [ $outputformat = "phylip" -a $outorder = "input" ]; then outputopt=" -y " elif [ $outputformat = "pir" -a $outorder = "aligned" ]; then - outputopt=" -f -r $TMPFILE/order " + outputopt=" -f -l $linelength -r $TMPFILE/order " else - outputopt="-f" + outputopt="-f -l $linelength" fi if [ $newdash_originalsequenceonly -eq 1 ]; then @@ -2502,7 +2588,7 @@ $ownlist" "$prefix/pairlocalalign" $localparam $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -L $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hatx hat3 - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/tbfast" _ -u $unalignlevel $localparam -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -L $usenaivepairscore $focusarg _ -+ $iterate -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg $focusarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2518,7 +2604,7 @@ $ownlist" "$prefix/pairlocalalign" $addarg -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -R $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hatx hat3 - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/pairlocalalign" -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -R $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 # addarg wo watasanai @@ -2527,27 +2613,27 @@ $ownlist" "$prefix/tbfast" -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi elif [ $distance = "lastmulti" ]; then - "$prefix/dndpre" $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 mv hat2 hat2i "$prefix/pairlocalalign" $addarg -C $numthreads $seqtype $model -e $last_e -w $last_m -g $lexp -f $lgop -Q $spfactor -h $laof -r $last_subopt $last_once -d "$prefix" < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hat2 hat2n mv hatx hat3 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 fi elif [ $distance = "multi" ]; then - "$prefix/dndpre" $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $model -M 2 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 mv hat2 hat2i "$prefix/pairlocalalign" $localparam $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -Q $spfactor -h $laof -Y $usenaivepairscore < infile > /dev/null 2>>"$progressfile" || exit 1 cat hat3.seed hat3 > hatx mv hat2 hat2n mv hatx hat3 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 @@ -2558,7 +2644,7 @@ $ownlist" mv hatx hat3 "$prefix/disttbfast" -E 1 -s $unalignlevel $legacygapopt -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt -T -y $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else "$prefix/tbfast" -W $minimumweight -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreadstb $rnaopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2569,22 +2655,22 @@ $ownlist" "$prefix/splittbfast" $legacygapopt $algopt $splitopt $partorderopt $parttreeoutopt $memopt $seqtype $model -f "-"$gop -Q $spfactor -h $aof -p $partsize -s $groupsize $treealg $outnum -i infile > pre 2>>"$progressfile" || exit 1 mv hat3.seed hat3 elif [ $distance = "ktuplesmulti" ]; then -# "$prefix/dndpre" $model -M 1 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 +# "$prefix/dndpre" -y $distformat $model -M 1 $addarg -C $numthreads $seqtype $model -g $lexp -f $lgop -h $laof < infile > /dev/null 2>>"$progressfile" || exit 1 # mv hat2 hat2i # "$prefix/disttbfast" -E 1 -s $unalignlevel $legacygapopt -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt -T -y $seqtype $model -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 # mv hat2 hat2n if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -d -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 -# "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -d -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 +# "$prefix/addsingle" -Q 100 $legacygapopt -d -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else echo "Impossible" 1>&2 exit 1 fi else if [ $fragment -ne 0 ]; then - "$prefix/addsingle" -Q 100 $legacygapopt -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg $scoreoutarg < infile > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/addsingle" $codonposopt $codonscoreopt -Q 100 $legacygapopt -W $tuplesize -O $outnum $addsinglearg $addarg $add2ndhalfarg -C $numthreads $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -f "-"$gop -h $aof $param_fft $localparam $algopt $treealg < infile > /dev/null 2>>"$progressfile" || exit 1 else - "$prefix/disttbfast" -q $npickup -E $cycledisttbfast -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -g $gexp -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg $anchoropt -x $maxanchorseparation $oneiterationopt < infile > pre 2>>"$progressfile" || exit 1 + "$prefix/disttbfast" -q $npickup -E $cycledisttbfast -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -g $gexp -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg $anchoropt -x $terminalmargin $oneiterationopt < infile > pre 2>>"$progressfile" || exit 1 mv hat3.seed hat3 fi fi @@ -2602,7 +2688,7 @@ $ownlist" done if [ $iterate -gt 0 ]; then if [ $distance = "ktuples" ]; then - "$prefix/dndpre" $seqtype $model -M 2 -C $numthreads < pre > /dev/null 2>>"$progressfile" || exit 1 + "$prefix/dndpre" -y $distformat $seqtype $model -M 2 -C $numthreads < pre > /dev/null 2>>"$progressfile" || exit 1 fi "$prefix/dvtditr" -W $minimumweight $bunkatsuopt -E $fixthreshold -s $unalignlevel $legacygapopt $mergearg $outnum -C $numthreadsit -t $randomseed $rnaoptit $memopt $scorecalcopt $localparam -z 50 $seqtype $model -f "-"$gop -Q $spfactor -h $aof -I $iterate $weightopt $treeinopt $algoptit $treealg -p $parallelizationstrategy $scoreoutarg -K $nadd < pre > /dev/null 2>>"$progressfile" || exit 1 fi @@ -2711,7 +2797,7 @@ $ownlist" popd > /dev/null - if [ "$outputopt" != "-f" -o "$windows" = "yes" ]; then # Windows deha kaigyo code wo f2cl de modosu. + if [ "$outputopt" != "-f -l -1" -o "$windows" = "yes" ]; then # Windows deha kaigyo code wo f2cl de modosu. # ln -s "$TMPFILE/order" _order$$ # f2cl ga space ari filename ni taiou shiteinainode # cp "$TMPFILE/order" _order$$ # ln -s no error wo sakeru if [ "$outputfile" = "" ]; then @@ -2737,7 +2823,7 @@ $ownlist" fi if [ $distout -eq 1 ]; then - cp "$TMPFILE/hat2" "$infilename.hat2" + cp "$TMPFILE/${distformat}" "$infilename.${distformat}" fi if [ $npickup -ne 0 ]; then diff --git a/software/mafft/mafft-mac/mafftdir/libexec/addsingle b/software/mafft/mafft-mac/mafftdir/libexec/addsingle index 1ff0f8f6..f6500531 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/addsingle and b/software/mafft/mafft-mac/mafftdir/libexec/addsingle differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/contrafoldwrap b/software/mafft/mafft-mac/mafftdir/libexec/contrafoldwrap index 1402942a..4fd93968 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/contrafoldwrap and b/software/mafft/mafft-mac/mafftdir/libexec/contrafoldwrap differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/countlen b/software/mafft/mafft-mac/mafftdir/libexec/countlen index 84789f2a..62850080 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/countlen and b/software/mafft/mafft-mac/mafftdir/libexec/countlen differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/dash_client b/software/mafft/mafft-mac/mafftdir/libexec/dash_client index 86d8de74..e0e22560 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/dash_client and b/software/mafft/mafft-mac/mafftdir/libexec/dash_client differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/disttbfast b/software/mafft/mafft-mac/mafftdir/libexec/disttbfast index 8a1c0147..99dcd2e7 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/disttbfast and b/software/mafft/mafft-mac/mafftdir/libexec/disttbfast differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/dndblast b/software/mafft/mafft-mac/mafftdir/libexec/dndblast index dafe7731..21194f9a 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/dndblast and b/software/mafft/mafft-mac/mafftdir/libexec/dndblast differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/dndfast7 b/software/mafft/mafft-mac/mafftdir/libexec/dndfast7 index 66ebce1c..792960c6 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/dndfast7 and b/software/mafft/mafft-mac/mafftdir/libexec/dndfast7 differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/dndpre b/software/mafft/mafft-mac/mafftdir/libexec/dndpre index 4fe0bc7b..889f2ea4 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/dndpre and b/software/mafft/mafft-mac/mafftdir/libexec/dndpre differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/dvtditr b/software/mafft/mafft-mac/mafftdir/libexec/dvtditr index d72a3d38..ce67fbe4 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/dvtditr and b/software/mafft/mafft-mac/mafftdir/libexec/dvtditr differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/f2cl b/software/mafft/mafft-mac/mafftdir/libexec/f2cl index 461549c1..f74b9f2d 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/f2cl and b/software/mafft/mafft-mac/mafftdir/libexec/f2cl differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/filter b/software/mafft/mafft-mac/mafftdir/libexec/filter index f5c6dd2e..f4cd6089 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/filter and b/software/mafft/mafft-mac/mafftdir/libexec/filter differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/getlag b/software/mafft/mafft-mac/mafftdir/libexec/getlag index 71cdce71..a685e29e 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/getlag and b/software/mafft/mafft-mac/mafftdir/libexec/getlag differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/hex2maffttext b/software/mafft/mafft-mac/mafftdir/libexec/hex2maffttext index d5a9601f..2f84bc69 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/hex2maffttext and b/software/mafft/mafft-mac/mafftdir/libexec/hex2maffttext differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/mafft-distance b/software/mafft/mafft-mac/mafftdir/libexec/mafft-distance index fcd8f50e..8b9e2054 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/mafft-distance and b/software/mafft/mafft-mac/mafftdir/libexec/mafft-distance differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/mafft-profile b/software/mafft/mafft-mac/mafftdir/libexec/mafft-profile index ecb9e93c..82325e96 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/mafft-profile and b/software/mafft/mafft-mac/mafftdir/libexec/mafft-profile differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/maffttext2hex b/software/mafft/mafft-mac/mafftdir/libexec/maffttext2hex index f33f5b60..a4dfcc73 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/maffttext2hex and b/software/mafft/mafft-mac/mafftdir/libexec/maffttext2hex differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/makedirectionlist b/software/mafft/mafft-mac/mafftdir/libexec/makedirectionlist index 53db353a..b117cbbd 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/makedirectionlist and b/software/mafft/mafft-mac/mafftdir/libexec/makedirectionlist differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/mccaskillwrap b/software/mafft/mafft-mac/mafftdir/libexec/mccaskillwrap index ddf86a43..12814622 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/mccaskillwrap and b/software/mafft/mafft-mac/mafftdir/libexec/mccaskillwrap differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/multi2hat3s b/software/mafft/mafft-mac/mafftdir/libexec/multi2hat3s index 5599e514..f6377bc3 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/multi2hat3s and b/software/mafft/mafft-mac/mafftdir/libexec/multi2hat3s differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/nodepair b/software/mafft/mafft-mac/mafftdir/libexec/nodepair index acf55903..53040ef7 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/nodepair and b/software/mafft/mafft-mac/mafftdir/libexec/nodepair differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/pairash b/software/mafft/mafft-mac/mafftdir/libexec/pairash index b64df32f..6d6750c1 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/pairash and b/software/mafft/mafft-mac/mafftdir/libexec/pairash differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/pairlocalalign b/software/mafft/mafft-mac/mafftdir/libexec/pairlocalalign index 80e7b591..0987d920 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/pairlocalalign and b/software/mafft/mafft-mac/mafftdir/libexec/pairlocalalign differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/regtable2seq b/software/mafft/mafft-mac/mafftdir/libexec/regtable2seq index b5c4af47..875250f0 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/regtable2seq and b/software/mafft/mafft-mac/mafftdir/libexec/regtable2seq differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/replaceu b/software/mafft/mafft-mac/mafftdir/libexec/replaceu index 4e7bbbf8..b8a31030 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/replaceu and b/software/mafft/mafft-mac/mafftdir/libexec/replaceu differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/restoreu b/software/mafft/mafft-mac/mafftdir/libexec/restoreu index a8eeac63..0c0c2191 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/restoreu and b/software/mafft/mafft-mac/mafftdir/libexec/restoreu differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/score b/software/mafft/mafft-mac/mafftdir/libexec/score index 66977105..ac0f43cf 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/score and b/software/mafft/mafft-mac/mafftdir/libexec/score differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/seq2regtable b/software/mafft/mafft-mac/mafftdir/libexec/seq2regtable index f283d554..b91999a8 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/seq2regtable and b/software/mafft/mafft-mac/mafftdir/libexec/seq2regtable differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/setcore b/software/mafft/mafft-mac/mafftdir/libexec/setcore index ba798fea..97472142 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/setcore and b/software/mafft/mafft-mac/mafftdir/libexec/setcore differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/setdirection b/software/mafft/mafft-mac/mafftdir/libexec/setdirection index 1f06ecf4..4f670165 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/setdirection and b/software/mafft/mafft-mac/mafftdir/libexec/setdirection differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/sextet5 b/software/mafft/mafft-mac/mafftdir/libexec/sextet5 index 25d8d165..3e44da5c 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/sextet5 and b/software/mafft/mafft-mac/mafftdir/libexec/sextet5 differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/splittbfast b/software/mafft/mafft-mac/mafftdir/libexec/splittbfast index 14741baf..afe1e6f7 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/splittbfast and b/software/mafft/mafft-mac/mafftdir/libexec/splittbfast differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/tbfast b/software/mafft/mafft-mac/mafftdir/libexec/tbfast index 174b5772..133da2b5 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/tbfast and b/software/mafft/mafft-mac/mafftdir/libexec/tbfast differ diff --git a/software/mafft/mafft-mac/mafftdir/libexec/version b/software/mafft/mafft-mac/mafftdir/libexec/version index de766d27..a75621ad 100755 Binary files a/software/mafft/mafft-mac/mafftdir/libexec/version and b/software/mafft/mafft-mac/mafftdir/libexec/version differ diff --git a/software/versions.txt b/software/versions.txt index e4bb821b..71ba35ca 100644 --- a/software/versions.txt +++ b/software/versions.txt @@ -1,13 +1,13 @@ Apache-POI: 3.8 -BLAST+: 2.12.0 +BLAST+: 2.17.0 CD-HIT: 4.8.1 -DIAMOND: 2.0.13 +DIAMOND: 2.1.17 GLPK: 4.40 GLPK (.mexmaca64 only): 5.0 libSBML: 5.20.2 libSBML (.mexmaci64 only): 5.19.0 M2HTML: 1.5 -HMMER: 3.3.2 -MAFFT: 7.490 +HMMER: 3.4.0 +MAFFT: 7.526 WoLFPSORT: v0.2 progressbar: 1.2.0.0 \ No newline at end of file diff --git a/testing/unit_tests/hmmerTests.m b/testing/unit_tests/hmmerTests.m index d17b8e8d..1a3fdf3a 100755 --- a/testing/unit_tests/hmmerTests.m +++ b/testing/unit_tests/hmmerTests.m @@ -21,6 +21,18 @@ function testHmmer(testCase) [ST, I]=dbstack('-completenames'); ravenPath=fileparts(fileparts(fileparts(ST(I).file))); +%Generate temporary names for working directory and outFile +tmpDIR=tempname; +outFile=tempname; + +%Create a temporary folder and copy multi-FASTA file there +[~, ~]=system(['mkdir "' tmpDIR '"']); + +sourceDir = fileparts(which(mfilename)); +copyfile(fullfile(sourceDir,'test_data','yeast_galactosidases.fa'),tmpDIR); +copyfile(fullfile(sourceDir,'test_data','human_galactosidases.fa'),tmpDIR); + + %Identify the operating system if isunix if ismac @@ -29,7 +41,10 @@ function testHmmer(testCase) binEnd=''; end elseif ispc - binEnd='.exe'; + wslPath.tmpDIR = getWSLpath(tmpDIR); + wslPath.hmmbuild = getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild')); + wslPath.hmmsearch = getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch')); + filesep='/'; else dispEM('Unknown OS, exiting.') return @@ -43,29 +58,26 @@ function testHmmer(testCase) expHmmResult.genes = {'sp|P41947|MEL6_YEASX','sp|P41946|MEL5_YEASX', 'sp|P41945|MEL2_YEASX', 'sp|P04824|MEL1_YEASX'}; expHmmResult.scores = [10^-250, 10^-250, 10^-250, 10^-250]; -%Generate temporary names for working directory and outFile -tmpDIR=tempname; -outFile=tempname; - %Run HMMER multi-threaded to use all logical cores assigned to MATLAB cores = evalc('feature(''numcores'')'); cores = strsplit(cores, 'MATLAB was assigned: '); cores = regexp(cores{2},'^\d*','match'); cores = cores{1}; -%Create a temporary folder and copy multi-FASTA file there -[~, ~]=system(['mkdir "' tmpDIR '"']); - -sourceDir = fileparts(which(mfilename)); -copyfile(fullfile(sourceDir,'test_data','yeast_galactosidases.fa'),tmpDIR); -copyfile(fullfile(sourceDir,'test_data','human_galactosidases.fa'),tmpDIR); - %% %Train a hidden Markov model -[~, ~]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +if ismac || isunix + [~, ~]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +else + [~, ~]=system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.tmpDIR '/human_galactosidases.hmm" "' wslPath.tmpDIR '/yeast_galactosidases.fa"']); +end %Run a homology search against the newly-trained HMM -[~, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +if ismac || isunix + [~, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(tmpDIR,'human_galactosidases.hmm') '" "' fullfile(tmpDIR,'yeast_galactosidases.fa') '"']); +else + [~, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.tmpDIR '/human_galactosidases.hmm" "' wslPath.tmpDIR '/yeast_galactosidases.fa"']); +end %Save the output to a file fid=fopen(outFile,'w'); diff --git a/testing/unit_tests/test_data/ecoli_textbook.mat b/testing/unit_tests/test_data/ecoli_textbook.mat index ac202963..67e344ad 100644 Binary files a/testing/unit_tests/test_data/ecoli_textbook.mat and b/testing/unit_tests/test_data/ecoli_textbook.mat differ diff --git a/testing/unit_tests/test_data/expBlastResults.mat b/testing/unit_tests/test_data/expBlastResults.mat index 051bfa72..7b81eac8 100644 Binary files a/testing/unit_tests/test_data/expBlastResults.mat and b/testing/unit_tests/test_data/expBlastResults.mat differ diff --git a/testing/unit_tests/test_data/expCdhitMafftOutput.mat b/testing/unit_tests/test_data/expCdhitMafftOutput.mat index 6d80662d..d13059f6 100644 Binary files a/testing/unit_tests/test_data/expCdhitMafftOutput.mat and b/testing/unit_tests/test_data/expCdhitMafftOutput.mat differ diff --git a/testing/unit_tests/test_data/expDiamondResults.mat b/testing/unit_tests/test_data/expDiamondResults.mat index d064d42c..c242a16a 100644 Binary files a/testing/unit_tests/test_data/expDiamondResults.mat and b/testing/unit_tests/test_data/expDiamondResults.mat differ diff --git a/testing/unit_tests/test_data/importExportResults.mat b/testing/unit_tests/test_data/importExportResults.mat index 83f86ac8..4a3f9c35 100644 Binary files a/testing/unit_tests/test_data/importExportResults.mat and b/testing/unit_tests/test_data/importExportResults.mat differ diff --git a/testing/unit_tests/test_data/miriamTestOutput.mat b/testing/unit_tests/test_data/miriamTestOutput.mat index d5fc3b09..8035ad4e 100644 Binary files a/testing/unit_tests/test_data/miriamTestOutput.mat and b/testing/unit_tests/test_data/miriamTestOutput.mat differ diff --git a/testing/unit_tests/test_data/solverTestOutput.mat b/testing/unit_tests/test_data/solverTestOutput.mat index a0e8dd76..aaec2fc2 100644 Binary files a/testing/unit_tests/test_data/solverTestOutput.mat and b/testing/unit_tests/test_data/solverTestOutput.mat differ diff --git a/tutorial/empty.mat b/tutorial/empty.mat index 936aead1..7e05da08 100644 Binary files a/tutorial/empty.mat and b/tutorial/empty.mat differ diff --git a/tutorial/iMK1208+suppInfo.mat b/tutorial/iMK1208+suppInfo.mat index 9aa45a08..86a5fc7c 100644 Binary files a/tutorial/iMK1208+suppInfo.mat and b/tutorial/iMK1208+suppInfo.mat differ diff --git a/tutorial/pathway.mat b/tutorial/pathway.mat index bc01714f..5ffd49bc 100644 Binary files a/tutorial/pathway.mat and b/tutorial/pathway.mat differ diff --git a/tutorial/pcPathway.mat b/tutorial/pcPathway.mat index acfa80b5..d696b633 100644 Binary files a/tutorial/pcPathway.mat and b/tutorial/pcPathway.mat differ