Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ mafft binary
.gitattributes export-ignore
.gitignore export-ignore
.github export-ignore
*.mat filter=lfs diff=lfs merge=lfs -text
1,031 changes: 523 additions & 508 deletions doc/external/kegg/getKEGGModelForOrganism.html

Large diffs are not rendered by default.

485 changes: 243 additions & 242 deletions doc/external/kegg/getModelFromKEGG.html

Large diffs are not rendered by default.

859 changes: 430 additions & 429 deletions doc/external/kegg/getRxnsFromKEGG.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion doc/io/exportModel.html
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ <h2><a name="_source"></a>SOURCE CODE <a href="#_top"><img alt="^" border="0" sr
0568 model.subSystems = cellfun(@(c) cellfun(@char, c, <span class="string">'UniformOutput'</span>, false), model.subSystems, <span class="string">'UniformOutput'</span>, false);
0569
0570 <span class="comment">% === 2) Flatten once: names and their reaction indices (vectorized) ===</span>
0571 flatNames = [model.subSystems{:}]; <span class="comment">% 1×M cellstr of all subsystem labels</span>
0571 flatNames = vertcat(model.subSystems{:}); <span class="comment">% 1×M cellstr of all subsystem labels</span>
0572 <span class="keyword">if</span> isempty(flatNames)
0573 <span class="comment">% Nothing to do: no subsystems present</span>
0574 <span class="keyword">return</span>
Expand Down
244 changes: 128 additions & 116 deletions doc/testing/unit_tests/hmmerTests.html

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions doc/tutorial/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ <h2>Matlab files in this directory:</h2>
<h2>Other Matlab-specific files in this directory:</h2>
<ul style="list-style-image:url(../matlabicon.gif)">
<li>empty.mat</li><li>iMK1208+suppInfo.mat</li><li>pathway.mat</li><li>pcPathway.mat</li></ul>
<h2>Subsequent directories:</h2>
<ul style="list-style-image:url(../matlabicon.gif)">
<li>struct_conversion</li></ul>


<hr><address>Generated by <strong><a href="http://www.artefact.tk/software/matlab/m2html/" title="Matlab Documentation in HTML">m2html</a></strong> &copy; 2005</address>
</body>
Expand Down
125 changes: 70 additions & 55 deletions external/kegg/getKEGGModelForOrganism.m
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
% The hidden Markov models as generated in 2b or
% downloaded from BioMet Toolbox (see below)
% The final directory in dataDir should be styled as
% prok90_kegg105 or euk90_kegg105, indicating whether
% prok90_kegg116 or euk90_kegg116, indicating whether
% the HMMs were trained on pro- or eukaryotic
% sequences; using which sequence similarity treshold
% (first set of digits); using which KEGG version
Expand Down Expand Up @@ -99,12 +99,12 @@
% If -1 is provided, CD-HIT is skipped (optional, default 0.9)
% globalModel structure containing both model and KOModel
% structures as generated by getModelFromKEGG. These
% will otherwise be loaded by via getModelFromKEGG.
% will otherwise be loaded by via getModelFromKEGG.
% Providing globalKEGGmodel can speed up model
% generation if getKEGGModelForOrganism is run
% multiple times for different strains. Example:
% [globalModel.model,globalModel.KOModel] = getModelFromKEGG;
% (optional, default empty, global model is loaded by
% (optional, default empty, global model is loaded by
% getModelFromKEGG)
%
% Output:
Expand Down Expand Up @@ -259,36 +259,36 @@
else
outDir=char(outDir);
end
if nargin<5
if nargin<5 || isempty(keepSpontaneous)
keepSpontaneous=true;
end
if nargin<6
if nargin<6 || isempty(keepUndefinedStoich)
keepUndefinedStoich=true;
end
if nargin<7
if nargin<7 || isempty(keepIncomplete)
keepIncomplete=true;
end
if nargin<8
if nargin<8 || isempty(keepGeneral)
keepGeneral=false;
end
if nargin<9
if nargin<9 || isempty(cutOff)
cutOff=10^-50;
end
if nargin<10
if nargin<10 || isempty(minScoreRatioKO)
minScoreRatioKO=0.3;
end
if nargin<11
if nargin<11 || isempty(minScoreRatioG)
minScoreRatioG=0.8;
end
if nargin<12
if nargin<12 || isempty(maxPhylDist)
maxPhylDist=inf;
%Include all sequences for each reaction
end
if nargin<13
if nargin<13 || isempty(nSequences)
nSequences=inf;
%Include all sequences for each reaction
end
if nargin<14
if nargin<14 || isempty(seqIdentity)
seqIdentity=0.9;
end

Expand All @@ -315,9 +315,9 @@
%required zip file already in working directory or have it extracted. If
%the zip file and directory is not here, it is downloaded from the cloud
if ~isempty(dataDir)
hmmOptions={'euk90_kegg105','prok90_kegg105'};
hmmOptions={'euk90_kegg116','prok90_kegg116'};
if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions.
%If not, then check whether the required folders exist anyway.
%If not, then check whether the required folders exist anyway.
if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ...
~isfolder(fullfile(dataDir,'fasta')) && ...
~isfolder(fullfile(dataDir,'aligned')) && ...
Expand All @@ -339,14 +339,14 @@
else
fprintf('Downloading the HMMs archive file... ');
try
websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']);
websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.11.0/',hmmOptions{hmmIndex},'.zip']);
catch ME
if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError')
error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues')
end
end
end

fprintf('COMPLETE\n');
fprintf('Extracting the HMMs archive file... ');
unzip([dataDir,'.zip']);
Expand Down Expand Up @@ -406,7 +406,7 @@
if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes'])
error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.');
end

fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']);
if ismember(organismID,{'eukaryotes','prokaryotes'})
phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1);
Expand Down Expand Up @@ -552,6 +552,20 @@
return
end

tmpFile=tempname;
%On Windows, paths need to be translated to Unix before parsing it to WSL
if ispc
wslPath.tmpFile=getWSLpath(tmpFile);
%mafft has problems writing to terminal (/dev/stderr) when running
%on WSL via MATLAB, instead write and read progress file
mafftOutput = tempname;
wslPath.mafftOutput=getWSLpath(mafftOutput);
wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat'));
wslPath.hmmbuild=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild'));
wslPath.hmmsearch=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch'));
wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit'));
end

%Check if alignment of FASTA files should be performed
missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]);
if ~isempty(missingAligned)
Expand All @@ -561,18 +575,7 @@
fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete');
end
missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned)));
tmpFile=tempname;
%On Windows, paths need to be translated to Unix before parsing it to WSL
if ispc
wslPath.tmpFile=getWSLpath(tmpFile);
%mafft has problems writing to terminal (/dev/stderr) when running
%on WSL via MATLAB, instead write and read progress file
mafftOutput = tempname;
wslPath.mafftOutput=getWSLpath(mafftOutput);
wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat'));
wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit'));
end


for i=1:numel(missingAligned)
%This is checked here because it could be that it is created by a
%parallel process. The faw-files are saved as temporary files to
Expand All @@ -587,7 +590,7 @@
dispEM(EM,false);
continue;
end

%If the multi-FASTA file is empty then save an empty aligned
%file and continue
s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']));
Expand All @@ -596,17 +599,17 @@
fclose(fid);
continue;
end

%Create an empty file to prevent other threads to start to work
%on the same alignment
fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w');
fclose(fid);

%First load the FASTA file, then select up to nSequences
%sequences of the most closely related species, apply any
%constraints from maxPhylDist, and save it as a temporary file,
%and create the model from that

fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']));
phylDist=inf(numel(fastaStruct),1);
for j=1:numel(fastaStruct)
Expand All @@ -620,24 +623,24 @@
end
end
end

%Inf means that it should not be included
phylDist(phylDist>maxPhylDist)=[];

%Sort based on phylDist
[~, order]=sort(phylDist);

%Save the first nSequences hits to a temporary FASTA file
if nSequences<=numel(fastaStruct)
fastaStruct=fastaStruct(order(1:nSequences));
else
fastaStruct=fastaStruct(order);
end

%Do the clustering and alignment if there are more than one
%sequences, otherwise just save the sequence (or an empty file)
if numel(fastaStruct)>1
if seqIdentity~=-1
if seqIdentity~=-1
cdhitInpCustom=tempname;
fastawrite(cdhitInpCustom,fastaStruct);
if seqIdentity<=1 && seqIdentity>0.7
Expand Down Expand Up @@ -712,7 +715,7 @@
end
%Move the temporary file to the real one
movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f');

%Print the progress every 25 files
if rem(i-1,25) == 0
progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns)));
Expand Down Expand Up @@ -750,7 +753,7 @@
dispEM(EM,false);
continue;
end

%If the multi-FASTA file is empty then save an empty aligned
%file and continue
s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']));
Expand All @@ -763,14 +766,20 @@
%KO. This is because hmmbuild cannot overwrite existing files
fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w');
fclose(fid);

%Create HMM
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']);
if ismac || isunix
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']);
else
wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']));
wslPath.alignFile = getWSLpath(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']));
[status, output] = system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.alignFile '"']);
end
if status~=0
EM=['Error when training HMM for ' missingHMMs{i} ':\n' output];
dispEM(EM);
end

%Delete the temporary file
delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']));

Expand Down Expand Up @@ -805,30 +814,36 @@
dispEM(EM,false);
continue;
end

%Save an empty file to prevent several threads working on the
%same file
fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w');
fclose(fid);

%If the HMM file is empty then save an out file and continue
s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']));
if s.bytes<=0
continue;
end

%Check each gene in the input file against this model
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']);
if ismac || isunix
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']);
else
wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']));
wslPath.fastaFile = getWSLpath(fastaFile);
[status, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.fastaFile '"']);
end
if status~=0
EM=['Error when querying HMM for ' missingOUT{i} ':\n' output];
dispEM(EM);
end

%Save the output to a file
fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w');
fwrite(fid,output);
fclose(fid);

%Print the progress every 25 files
if rem(i-1,25) == 0
progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns)));
Expand Down Expand Up @@ -861,16 +876,16 @@
while 1
%Get the next line
tline = fgetl(fid);

%Abort at end of file
if ~ischar(tline)
break;
end

if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------'))
break;
end

if beginMatches==false
%This is how the listing of matches begins
if any(strfind(tline,'E-value '))
Expand All @@ -883,7 +898,7 @@
if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline)
elements=regexp(tline,' ','split');
elements=elements(cellfun(@any,elements));

%Check if the match is below the treshhold
score=str2double(elements{1});
gene=elements{9};
Expand Down Expand Up @@ -952,7 +967,7 @@
%Find the KOs and the corresponding genes
J=ismember(KOModel.rxns,KOs);
[~, K]=find(koGeneMat(J,:));

if any(K)
model.rxnGeneMat(i,K)=1;
%Also delete KOs for which no genes were found. If no genes at
Expand Down
5 changes: 3 additions & 2 deletions external/kegg/getModelFromKEGG.m
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@
% Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,...
% keepUndefinedStoich,keepIncomplete,keepGeneral)

ravenPath=findRAVENroot();

if nargin<1
keggPath='RAVEN/external/kegg';
keggPath=fullfile(ravenPath,'external','kegg');
else
keggPath=char(keggPath);
end
Expand All @@ -62,7 +64,6 @@
keepGeneral=false;
end

ravenPath=findRAVENroot();
modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat');
if exist(modelFile, 'file') && isNewestFile(ravenPath)
fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']);
Expand Down
5 changes: 3 additions & 2 deletions external/kegg/getRxnsFromKEGG.m
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,16 @@
% (except for '///')
%

ravenPath=findRAVENroot();

if nargin<1
keggPath='RAVEN/external/kegg';
keggPath=fullfile(ravenPath,'external','kegg');
else
keggPath=char(keggPath);
end

%Check if the reactions have been parsed before and saved. If so, load the
%model
ravenPath=findRAVENroot();
rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat');
if exist(rxnsFile, 'file')
fprintf(['Importing KEGG reactions from ' strrep(rxnsFile,'\','/') '... ']);
Expand Down
Loading
Loading