OpenMS  2.5.0
MzTabFile.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Timo Sachsenberg $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/FORMAT/MzTab.h>
38 
43 
44 #include <boost/math/special_functions/fpclassify.hpp>
45 
46 #include <vector>
47 #include <algorithm>
48 
49 namespace OpenMS
50 {
// Forward declarations of types named by the declarations below.
// SVOutStream is only taken by reference in this header (writePeptideHeader_,
// writeProteinHeader_, writeProteinData_). String, however, is constructed inside
// the inline template generateMzTabSection_, so its full definition must already be
// visible at that point - presumably via <OpenMS/FORMAT/MzTab.h>; TODO confirm.
51  class String;
52  class SVOutStream;
// File adapter for MzTab files: declares store()/load() for an MzTab data model and
// the protected helpers that render the individual mzTab sections to strings.
// NOTE(review): listing lines 86-101 are absent from this listing; according to the
// member index at the end of the page they hold the bool store_*_ flag members.
58  class OPENMS_DLLAPI MzTabFile
59  {
60  public:
    // Constructor / destructor are only declared here; their definitions are not
    // part of this listing.
62  MzTabFile();
64  ~MzTabFile();
65 
    // Maps a (String, String) key to a list of peptide hits. Judging by the parameter
    // names used below (map_run_accession_to_pephits) the key is presumably
    // (run identifier, protein accession) - TODO confirm against the implementation.
    // NOTE(review): std::map is used but <map> is not among the includes visible in
    // this header; presumably it arrives transitively via MzTab.h - confirm.
66  typedef std::map<std::pair<String, String>, std::vector<PeptideHit> > MapAccPepType;
67 
68  // Store the MzTab data model mz_tab in the file named filename.
    // Declared const, so storing does not modify the adapter object itself.
69  void store(const String& filename, const MzTab& mz_tab) const;
70 
71  // Set store behaviour of the optional "reliability" and "uri" columns (default=no),
    // one setter per section type (protein, peptide, PSM, small molecule).
    // storeProteinGoTerms is grouped with them; presumably it toggles output of the
    // protein GO terms in the same way - TODO confirm.
    // NOTE(review): the member index also lists flags for the nucleic acid,
    // oligonucleotide and OSM sections, but no public setters for those are visible.
72  void storeProteinReliabilityColumn(bool store);
73  void storePeptideReliabilityColumn(bool store);
74  void storePSMReliabilityColumn(bool store);
75  void storeSmallMoleculeReliabilityColumn(bool store);
76  void storeProteinUriColumn(bool store);
77  void storePeptideUriColumn(bool store);
78  void storePSMUriColumn(bool store);
79  void storeSmallMoleculeUriColumn(bool store);
80  void storeProteinGoTerms(bool store);
81 
82  // Load the MzTab file named filename into mz_tab (non-const reference, i.e. the
    // output parameter).
83  void load(const String& filename, MzTab& mz_tab);
84 
85  protected:
102 
    // Section generators. For each mzTab section type there is a *Header_ function
    // returning the header line as a String and a generateMzTabSectionRow_ overload
    // returning a single row as a String. optional_columns is passed to both;
    // presumably it holds the names of the optional columns to emit - TODO confirm.

    // Meta data section: takes the meta data and a StringList; presumably the
    // generated lines are appended to sl - TODO confirm.
103  void generateMzTabMetaDataSection_(const MzTabMetaData& map, StringList& sl) const;
104 
    // Protein (PRT) section.
105  String generateMzTabProteinHeader_(const MzTabProteinSectionRow& reference_row, const Size n_best_search_engine_scores, const std::vector<String>& optional_columns) const;
106 
107  String generateMzTabSectionRow_(const MzTabProteinSectionRow& row, const std::vector<String>& optional_columns) const;
108 
    // Peptide (PEP) section.
109  String generateMzTabPeptideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector<String>& optional_columns) const;
110 
111  String generateMzTabSectionRow_(const MzTabPeptideSectionRow& row, const std::vector<String>& optional_columns) const;
112 
    // PSM section.
113  String generateMzTabPSMHeader_(Size n_search_engine_scores, const std::vector<String>& optional_columns) const;
114 
115  String generateMzTabSectionRow_(const MzTabPSMSectionRow& row, const std::vector<String>& optional_columns) const;
116 
    // Small molecule (SML) section.
117  String generateMzTabSmallMoleculeHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector<String>& optional_columns) const;
118 
119  String generateMzTabSectionRow_(const MzTabSmallMoleculeSectionRow& row, const std::vector<String>& optional_columns) const;
120 
    // Nucleic acid (NUC) section.
121  String generateMzTabNucleicAcidHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_scores, const std::vector<String>& optional_columns) const;
122 
123  String generateMzTabSectionRow_(const MzTabNucleicAcidSectionRow& row, const std::vector<String>& optional_columns) const;
124 
    // Oligonucleotide (OLI) section.
125  String generateMzTabOligonucleotideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, const std::vector<String>& optional_columns) const;
126 
127  String generateMzTabSectionRow_(const MzTabOligonucleotideSectionRow& row, const std::vector<String>& optional_columns) const;
128 
    // OSM (oligonucleotide-spectrum match) section.
129  String generateMzTabOSMHeader_(Size n_search_engine_scores, const std::vector<String>& optional_columns) const;
130 
131  String generateMzTabSectionRow_(const MzTabOSMSectionRow& row, const std::vector<String>& optional_columns) const;
132 
    // Generate an mzTab section comprising multiple rows of the same type.
    // Appends one String per element of rows to output, followed by a single "\n"
    // entry. Existing content of output is kept. The header line is not written here.
134  template <typename SectionRow> void generateMzTabSection_(const std::vector<SectionRow>& rows, const std::vector<String>& optional_columns, StringList& output) const
135  {
    // Grow once up front: one entry per row plus the trailing "\n" entry.
136  output.reserve(output.size() + rows.size() + 1);
137  for (const auto& row : rows)
138  {
    // Overload resolution selects the generateMzTabSectionRow_ matching SectionRow.
139  output.push_back(generateMzTabSectionRow_(row, optional_columns));
140  }
    // Entry holding only a newline, pushed after the last row of the section.
141  output.push_back(String("\n"));
142  }
143 
144  // auxiliary functions (all static: they do not use the adapter's state)
145 
    // Takes the optional column names, the optional column entries of one row and
    // the output list; presumably appends the matching values to output - TODO confirm.
147  static void addOptionalColumnsToSectionRow_(const std::vector<String>& column_names, const std::vector<MzTabOptionalColumnEntry>& column_entries, StringList& output);
148 
149  // extract two integers from string (e.g. search_engine_score[1]_ms_run[2] -> 1,2)
150  static std::pair<int, int> extractIndexPairsFromBrackets_(const String& s);
151 
    // The two helpers below operate on the iterator range [begin, end) of peptide
    // identifications; exact ordering / filtering criteria are defined in the
    // implementation, which is not visible here.
152  static void sortPSM_(std::vector<PeptideIdentification>::iterator begin, std::vector<PeptideIdentification>::iterator end);
153 
154  static void keepFirstPSM_(std::vector<PeptideIdentification>::iterator begin, std::vector<PeptideIdentification>::iterator end);
155 
    // Inputs: pep_ids, pro_ids. Outputs: the two maps keyed by String (non-const
    // references); by name the key is a run identifier - TODO confirm.
157  static void partitionIntoRuns_(const std::vector<PeptideIdentification>& pep_ids,
158  const std::vector<ProteinIdentification>& pro_ids,
159  std::map<String, std::vector<PeptideIdentification> >& map_run_to_pepids,
160  std::map<String, std::vector<ProteinIdentification> >& map_run_to_proids
161  );
162 
163 
    // Input: per-run peptide identifications. Output: map_run_accession_to_pephits.
165  static void createProteinToPeptideLinks_(const std::map<String, std::vector<PeptideIdentification> >& map_run_to_pepids, MapAccPepType& map_run_accession_to_pephits);
166 
    // The following helpers return their result rendered as a String.
168  static String extractProteinAccession_(const PeptideHit& peptide_hit);
169 
171  static String extractPeptideModifications_(const PeptideHit& peptide_hit);
172 
174  static String mapSearchEngineToCvParam_(const String& openms_search_engine_name);
175 
176  static String mapSearchEngineScoreToCvParam_(const String& openms_search_engine_name, double score, String score_type);
177 
178  static String extractNumPeptides_(const String& common_identifier, const String& protein_accession,
179  const MapAccPepType& map_run_accession_to_peptides);
180 
181  // mzTab definition of distinct
    // NOTE(review): this and the next function take their String arguments by value,
    // whereas extractNumPeptides_ above takes them by const reference.
182  static String extractNumPeptidesDistinct_(String common_identifier, String protein_accession,
183  const MapAccPepType& map_run_accession_to_peptides);
184 
185  // same as distinct, with the additional constraint of uniqueness (= maps to exactly one protein)
186  static String extractNumPeptidesUnambiguous_(String common_identifier, String protein_accession,
187  const MapAccPepType& map_run_accession_to_peptides);
188 
    // Returns a map from String to Size computed from the per-run protein
    // identifications; by name, the number of sub-samples per run - TODO confirm.
189  static std::map<String, Size> extractNumberOfSubSamples_(const std::map<String, std::vector<ProteinIdentification> >& map_run_to_proids);
190 
    // Header / data writers targeting an SVOutStream.
    // NOTE(review): n_sub_samples is passed by value (copied) in both header writers.
191  static void writePeptideHeader_(SVOutStream& output, std::map<String, Size> n_sub_samples);
192 
193  static void writeProteinHeader_(SVOutStream& output, std::map<String, Size> n_sub_samples);
194 
195  static void writeProteinData_(SVOutStream& output,
196  const ProteinIdentification& prot_id,
197  Size run_count,
198  String input_filename,
199  bool has_coverage,
200  const MapAccPepType& map_run_accession_to_peptides,
201  const std::map<String, Size>& map_run_to_num_sub
202  );
203 
204  };
205 
206 } // namespace OpenMS
207 
void generateMzTabSection_(const std::vector< SectionRow > &rows, const std::vector< String > &optional_columns, StringList &output) const
Generate an mzTab section comprising multiple rows of the same type.
Definition: MzTabFile.h:134
File adapter for MzTab files.
Definition: MzTabFile.h:58
bool store_peptide_reliability_
Definition: MzTabFile.h:87
bool store_protein_uri_
Definition: MzTabFile.h:90
bool store_osm_reliability_
Definition: MzTabFile.h:97
std::map< std::pair< String, String >, std::vector< PeptideHit > > MapAccPepType
Definition: MzTabFile.h:66
size_t Size
Size type, e.g. used for variables that hold the result of size()
Definition: Types.h:127
bool store_protein_reliability_
Definition: MzTabFile.h:86
bool store_osm_uri_
Definition: MzTabFile.h:100
bool store_nucleic_acid_goterms_
Definition: MzTabFile.h:101
bool store_psm_uri_
Definition: MzTabFile.h:92
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:774
SML - Small molecule section (table-based)
Definition: MzTab.h:709
NUC - Nucleic acid section (table-based)
Definition: MzTab.h:739
PSM - PSM section (table-based)
Definition: MzTab.h:665
PEP - Peptide section (table-based)
Definition: MzTab.h:628
OSM - OSM (oligonucleotide-spectrum match) section (table-based)
Definition: MzTab.h:809
bool store_peptide_uri_
Definition: MzTabFile.h:91
bool store_psm_reliability_
Definition: MzTabFile.h:88
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
Representation of a protein identification run.
Definition: ProteinIdentification.h:71
PRT - Protein section (table-based)
Definition: MzTab.h:589
Data model of MzTab files. Please see the official MzTab specification at https://code.google.com/p/mztab/.
Definition: MzTab.h:855
bool store_smallmolecule_reliability_
Definition: MzTabFile.h:89
bool store_smallmolecule_uri_
Definition: MzTabFile.h:93
Representation of a peptide hit.
Definition: PeptideHit.h:54
A more convenient string class.
Definition: String.h:58
bool store_protein_goterms_
Definition: MzTabFile.h:94
bool store_oligonucleotide_reliability_
Definition: MzTabFile.h:96
All metadata of an mzTab file. Please refer to the specification for documentation.
Definition: MzTab.h:524
bool store_nucleic_acid_reliability_
Definition: MzTabFile.h:95
bool store_nucleic_acid_uri_
Definition: MzTabFile.h:98
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
bool store_oligonucleotide_uri_
Definition: MzTabFile.h:99
Stream class for writing to comma/tab/...-separated values files.
Definition: SVOutStream.h:54