summaryrefslogblamecommitdiff
path: root/kaldi_io/src/kaldi/util/kaldi-table.h
blob: 6f6cb98661fb9bcbc2871420e5860fbd5b54cc7b (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459










































































































































































































































































































































































































































































                                                                                         
// util/kaldi-table.h

// Copyright 2009-2011    Microsoft Corporation
//                2013    Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_UTIL_KALDI_TABLE_H_
#define KALDI_UTIL_KALDI_TABLE_H_

#include <string>
#include <vector>
#include <utility>

#include "base/kaldi-common.h"
#include "util/kaldi-holder.h"

namespace kaldi {

// Forward declarations of the implementation base classes.  The Table classes
// declared in this header only hold pointers to these (their impl_ members),
// so the full definitions are not needed here.
template<class Holder> class RandomAccessTableReaderImplBase;
template<class Holder>  class SequentialTableReaderImplBase;
template<class Holder>  class TableWriterImplBase;

/// \addtogroup table_group
/// @{

// This header defines the Table classes (RandomAccessTableReader,
// SequentialTableReader and TableWriter) and explains what the Holder classes,
// which the Table class requires as a template argument, are like.  It also
// explains the "rspecifier" and "wspecifier" concepts (these are strings that
// explain how to read/write objects via archives or scp files).  A table is
// conceptually a collection of objects of a particular type T indexed by keys
// of type std::string (these keys additionally have an order within each table).
// The Table classes are templated on a type (call it Holder) such that Holder::T
// is a typedef equal to T.

// see kaldi-holder.h for detail on the Holder classes.

// A list of keys (table keys are of type std::string).
typedef std::vector<std::string> KeyList;

// Documentation for "wspecifier"
// "wspecifier" describes how we write a set of objects indexed by keys.
// The basic, unadorned wspecifiers are as follows:
//
//  ark:wxfilename
//  scp:rxfilename
//  ark,scp:filename,wxfilename
//
//
//  We also allow the following modifiers:
//  t means text mode.
//  b means binary mode.
//  f means flush the stream after writing each entry.
//   (nf means don't flush; it isn't very useful, as not flushing is already
//   the default: the WspecifierOptions constructor initializes flush to false).
//  p means permissive mode, when writing to an "scp" file only: will ignore
//     missing scp entries, i.e. won't write anything for those files but will
//     return success status.
//
//  So the following are valid wspecifiers:
//  ark,b,f:foo
//  "ark,b,b:| gzip -c > foo"
//  "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz"
//  ark,b:-
//
//  The meanings of rxfilename and wxfilename are as described in
//  kaldi-stream.h (they are filenames but include pipes, stdin/stdout
//  and so on); filename is a regular filename.
//

//  The ark:wxfilename type of wspecifier instructs the class to
//  write directly to an archive.  For small objects (e.g. lists of ints),
//  the text archive format will generally be human readable with one line
//  per entry in the archive.
//
//  The type "scp:xfilename" refers to an scp file which should
//  already exist on disk, and tells us where to write the data for
//  each key (usually an actual file); each line of the scp file
//  would be:
//   key xfilename
//
//  The type ark,scp:filename,wxfilename means
//  we write both an archive and an scp file that specifies offsets into the
//  archive, with lines like:
//    key filename:12407
//  where the number is the byte offset into the file.
//  In this case we restrict the archive-filename to be an actual filename,
//  as we can't see a situation where an extended filename would make sense
//  for this (we can't fseek() in pipes).

// The kinds of wspecifier reported by ClassifyWspecifier().
// NOTE(review): the per-enumerator meanings below are inferred from the
// enumerator names and the wspecifier documentation in this header; confirm
// against the implementation of ClassifyWspecifier().
enum WspecifierType {
  kNoWspecifier = 0,       // presumably: not a recognized wspecifier.
  kArchiveWspecifier = 1,  // presumably the "ark:wxfilename" form.
  kScriptWspecifier = 2,   // presumably the "scp:rxfilename" form.
  kBothWspecifier = 3      // presumably the "ark,scp:filename,wxfilename" form.
};

// Options corresponding to the modifiers of a wspecifier (see the wspecifier
// documentation in this header).
struct WspecifierOptions {
  // True for binary mode (modifier "b"), false for text mode (modifier "t").
  bool binary;
  // True to flush the stream after writing each entry (modifier "f").
  bool flush;
  // Permissive mode (modifier "p"): will ignore absent scp entries.
  bool permissive;

  // Defaults: binary mode, no flushing after each entry, not permissive.
  WspecifierOptions()
      : binary(true),
        flush(false),
        permissive(false) { }
};

// ClassifyWspecifier returns the type of the wspecifier string (a
// WspecifierType) and, for each output pointer that is non-NULL, also
// outputs the extra information found in the string:
//   archive_wxfilename  receives the archive filename,
//   script_wxfilename   receives the script filename,
//   opts                receives the options.
WspecifierType ClassifyWspecifier(const std::string &wspecifier,
                                  std::string *archive_wxfilename,
                                  std::string *script_wxfilename,
                                  WspecifierOptions *opts);

// ReadScriptFile reads an .scp file in its entirety, and appends its entries
// (in the order they had in the scp file) to *script_out, which contains
// pairs of (key, xfilename).  The .scp file format is: on each line,
//   key xfilename
// where xfilename means rxfilename or wxfilename, and may contain internal
// spaces (we trim away any leading or trailing space).  The key is space-free.
// ReadScriptFile returns true if the format was valid (empty files
// are valid).
// If 'print_warnings' is true, it will print out warning messages that explain
// what kind of error there was.
bool ReadScriptFile(const std::string &rxfilename,
                    bool print_warnings,
                    std::vector<std::pair<std::string, std::string> > *script_out);

// This version of ReadScriptFile takes its input from the istream 'is'
// rather than from an rxfilename; 'print_warnings' and 'script_out' have the
// same types as in the rxfilename version.
bool ReadScriptFile(std::istream &is,
                    bool print_warnings,
                    std::vector<std::pair<std::string, std::string> > *script_out);

// WriteScriptFile writes, for each entry in 'script', the first element of
// the pair, then ' ', then the second element, then '\n'.  It checks that the
// keys (first elements of pairs) are valid tokens (nonempty, no whitespace),
// and that the values (second elements of pairs) are newline-free and contain
// no leading or trailing space.  Returns true on success.
bool WriteScriptFile(const std::string &wxfilename,
                     const std::vector<std::pair<std::string, std::string> > &script);

// This version of WriteScriptFile writes to the ostream 'os' rather than to
// a wxfilename.
bool WriteScriptFile(std::ostream &os,
                     const std::vector<std::pair<std::string, std::string> > &script);

// Documentation for "rspecifier"
// "rspecifier" describes how we read a set of objects indexed by keys.
// The possibilities are:
//
// ark:rxfilename
// scp:rxfilename
//
// We also allow various modifiers:
//   o   means the program will only ask for each key once, which enables
//       the reader to discard already-asked-for values.
//   s   means the keys are sorted on input (means we don't have to read till
//       eof if someone asked for a key that wasn't there).
//   cs  means that it is called in sorted order (we are generally asserting this
//       based on knowledge of how the program works).
//   p   means "permissive", and causes it to skip over keys whose corresponding
//       scp-file entries cannot be read. [and to ignore errors in archives and
//       script files, and just consider the "good" entries].
//
//   Any of the options above can be prefixed by n to negate it, as in no, ns,
//   ncs, np; but these aren't currently very useful (each is just equivalent
//   to omitting the corresponding option).
//
//   b   is ignored [for scripting convenience]
//   t   is ignored [for scripting convenience]
//
//
//  So for instance the following would be a valid rspecifier:
//
//   "o, s, p, ark:gunzip -c foo.gz|"

// Options corresponding to the modifiers of an rspecifier (see the rspecifier
// documentation in this header).
// These options only make a difference for the RandomAccessTableReader class.
// NOTE(review): the comments on SequentialTableReader::Value() also refer to
// the permissive option, so the statement above may not hold for
// 'permissive' -- confirm.
struct RspecifierOptions {
  // We assert that the program will only ask for each key once.
  bool once;
  // We assert that the keys are sorted.
  bool sorted;
  // We assert that the HasKey() and Value() functions will also be called in
  // sorted order.  [This implies "once" but not vice versa.]
  bool called_sorted;
  // If "permissive", when reading from scp files it treats scp files that
  // can't be read as if the corresponding key were not there.  For archive
  // files it will suppress errors getting thrown if the archive is corrupted
  // and can't be read to the end.
  bool permissive;

  // By default none of the options is set.
  RspecifierOptions()
      : once(false),
        sorted(false),
        called_sorted(false),
        permissive(false) { }
};

// The kinds of rspecifier reported by ClassifyRspecifier().
// NOTE(review): the per-enumerator meanings below are inferred from the
// enumerator names and the rspecifier documentation in this header; confirm
// against the implementation of ClassifyRspecifier().
enum RspecifierType {
  kNoRspecifier = 0,       // presumably: not a recognized rspecifier.
  kArchiveRspecifier = 1,  // presumably the "ark:rxfilename" form.
  kScriptRspecifier = 2    // presumably the "scp:rxfilename" form.
};

// ClassifyRspecifier returns the type of the rspecifier string (an
// RspecifierType).
// NOTE(review): presumably, by analogy with ClassifyWspecifier, it also
// outputs the filename part of the rspecifier to *rxfilename and the parsed
// options to *opts when those pointers are non-NULL -- confirm against the
// implementation.
RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename,
                                  RspecifierOptions *opts);

// Class Table<Holder> is useful when you want the entire set of
// objects in memory.  NOT IMPLEMENTED YET.
// It is the least scalable way of accessing data in Tables.
// The *TableReader and TableWriter classes are more scalable.


/// Allows random access to a collection of objects in an archive or script
/// file, looked up by key; see \ref io_sec_tables.
template<class Holder>
class RandomAccessTableReader {
 public:
  typedef typename Holder::T T;

  // Creates a reader with no table open (IsOpen() returns false).
  RandomAccessTableReader(): impl_(NULL) { }

  // Equivalent to the default constructor followed by Open(rspecifier),
  // except that it throws on error.
  RandomAccessTableReader(const std::string &rspecifier);

  // Copy-construction is allowed only for non-opened readers (this is needed
  // for inclusion in stl vector); it asserts that 'other' is not open, and
  // the new reader is not open either.
  RandomAccessTableReader(const RandomAccessTableReader<Holder> &other)
      : impl_(NULL) {
    KALDI_ASSERT(other.impl_ == NULL);
  }

  ~RandomAccessTableReader();

  // Opens the table.
  bool Open(const std::string &rspecifier);

  // Returns true if table is open.
  bool IsOpen() const { return impl_ != NULL; }

  // Close() will close the table [throws if it was not open],
  // and returns true on success (false if we were reading an
  // archive and we discovered an error in the archive).
  bool Close();

  // Says if it has this key.
  // If you are using the "permissive" (p) read option,
  // it will return false for keys whose corresponding entry
  // in the scp file cannot be read.
  bool HasKey(const std::string &key);

  // Returns the object stored under 'key'.
  // Value() may throw if you are reading an scp file, you
  // do not have the "permissive" (p) option, and an entry
  // in the scp file cannot be read.  Typically you won't
  // want to catch this error.
  const T &Value(const std::string &key);

 private:
  // Disallow assignment (declared but intentionally not usable by callers).
  RandomAccessTableReader &operator=(const RandomAccessTableReader<Holder>&);

  // Checks that impl_ is non-NULL; prints an error
  // message and dies (with KALDI_ERR) if NULL.
  void CheckImpl() const;

  // The implementation object; NULL whenever the table is not open.
  RandomAccessTableReaderImplBase<Holder> *impl_;
};



/// A templated class for reading objects sequentially from an archive or script
/// file; see \ref io_sec_tables.
template<class Holder>
class SequentialTableReader {
 public:
  typedef typename Holder::T T;

  // Creates a reader with no implementation object attached (impl_ is NULL).
  SequentialTableReader(): impl_(NULL) { }

  // Equivalent to the default constructor followed by Open(rspecifier),
  // except that it throws on error.
  SequentialTableReader(const std::string &rspecifier);

  // Copy-construction is allowed only for non-opened readers (this is needed
  // for inclusion in stl vector); it asserts that other.impl_ is NULL.
  SequentialTableReader(const SequentialTableReader<Holder> &other)
      : impl_(NULL) {
    KALDI_ASSERT(other.impl_ == NULL);
  }

  // The destructor may throw.  This is the desired behaviour, as it's the way
  // we signal the error to the user (to detect it, call Close()).  The issue
  // is that otherwise the user has no way to tell whether Done() returned true
  // because we reached the end of the archive or script, or because there was
  // an error that prevented further reading.
  // NOTE(review): from C++11 on, destructors are implicitly noexcept unless
  // declared otherwise -- confirm how the definition deals with this.
  ~SequentialTableReader();

  // Opens the table and returns the exit status.  It does, however, throw if
  // a previously opened stream was in an error state; call Close() first to
  // avoid this.  [Anyway, calling Open() more than once is not recommended.]
  bool Open(const std::string &rspecifier);

  // Returns true if table is open for reading (does not imply
  // stream is in good state).
  bool IsOpen() const;

  // Returns true if we're done.  It will also return true if there's some kind
  // of error and we can't read any more; in this case, you can detect the
  // error by calling Close() and checking the return status; otherwise
  // the destructor will throw.
  inline bool Done();

  // Returns the current key.  Only valid to call Key() if Done() returned
  // false.
  inline std::string Key();

  // Returns a reference to the current value.
  // The reference is valid till the next call to this object.
  // It will throw if you are reading an scp file, did not
  // specify the "permissive" (p) option and the file cannot
  // be read.  [The permissive option makes it behave as if that
  // key does not even exist, if the corresponding file cannot be
  // read.]  You probably wouldn't want to catch this exception;
  // the user can just specify the p option in the rspecifier.
  const T &Value();

  // FreeCurrent() is provided as an optimization to save memory, for large
  // objects.  It instructs the class to deallocate the current value.  The
  // reference returned by Value() will be invalidated by this.
  void FreeCurrent();

  // Next() goes to the next key.  It will not throw; any error will
  // result in Done() returning true, and then the destructor will
  // throw unless you call Close().
  void Next();

  // Close() will return false (failure) if Done() became true
  // because of an error condition rather than because we are
  // really done [e.g. because of an error or early termination
  // in the archive].
  // If there is an error and you don't call Close(), the destructor
  // will fail.
  bool Close();

 private:
  // Disallow assignment.
  SequentialTableReader &operator=(const SequentialTableReader<Holder>&);

  // Checks that impl_ is non-NULL; prints an error
  // message and dies (with KALDI_ERR) if NULL.
  void CheckImpl() const;

  // The implementation object; NULL after default or copy construction.
  SequentialTableReaderImplBase<Holder> *impl_;
};


/// A templated class for writing objects to an
/// archive or script file; see \ref io_sec_tables.
template<class Holder>
class TableWriter {
 public:
  typedef typename Holder::T T;

  // Creates a writer with no implementation object attached (impl_ is NULL).
  TableWriter(): impl_(NULL) { }

  // Equivalent to the default constructor followed by Open(wspecifier),
  // except that it throws on error.  See the documentation for wspecifier
  // earlier in this header.
  TableWriter(const std::string &wspecifier);

  // Copy-construction is allowed only for non-opened writers (this is needed
  // for inclusion in stl vector); it asserts that other.impl_ is NULL.
  TableWriter(const TableWriter &other)
      : impl_(NULL) {
    KALDI_ASSERT(other.impl_ == NULL);
  }

  ~TableWriter();

  // Opens the table.  See the documentation for wspecifier earlier in this
  // header.  If it returns true, it is open.
  bool Open(const std::string &wspecifier);

  // Returns true if open for writing.
  bool IsOpen() const;

  // Writes the object 'value' under 'key'.  Throws std::runtime_error on
  // error (via the KALDI_ERR macro).
  inline void Write(const std::string &key, const T &value) const;

  // Flush() will flush any archive; it does not return error status
  // or throw; any errors will be reported on the next Write() or Close().
  // Useful if we may be writing to a command in a pipe and want
  // to ensure good CPU utilization.
  void Flush();

  // Close() is not necessary to call, as the destructor
  // closes it; it's mainly useful if you want to handle
  // error states, because the destructor will throw on
  // error if you do not call Close().
  bool Close();

 private:
  // Disallow assignment.
  TableWriter &operator=(const TableWriter&);

  // Checks that impl_ is non-NULL; prints an error
  // message and dies (with KALDI_ERR) if NULL.
  void CheckImpl() const;

  // The implementation object; NULL after default or copy construction.
  TableWriterImplBase<Holder> *impl_;
};


/// This class is for when you are reading something in random access, where
/// the data may actually be stored per speaker (or something similar) while
/// the keys you're using are per utterance.  So you also provide an
/// "rxfilename" for a file containing lines like
/// utt1 spk1
/// utt2 spk1
/// utt3 spk1
/// and so on.  Note: this is optional; if it is an empty string, we just won't
/// do the mapping.  Also, "table_rxfilename" may be the empty string (as for
/// a regular table), in which case the table just won't be opened.
/// We provide only the most frequently used of the functions of
/// RandomAccessTableReader.
template<class Holder>
class RandomAccessTableReaderMapped {
 public:
  typedef typename Holder::T T;

  // Default constructor; the member readers are default-constructed.
  RandomAccessTableReaderMapped() {}

  /// Note: "utt2spk_rxfilename" will in the normal case be an rxfilename
  /// for an utterance to speaker map, but this code is general; it accepts
  /// a generic map.
  RandomAccessTableReaderMapped(const std::string &table_rxfilename,
                                const std::string &utt2spk_rxfilename);

  // The default copy-constructor will do what we want: it will crash
  // for already-opened readers, by calling the member-variable
  // copy-constructors.

  /// Note: when calling Open, utt2spk_rxfilename may be empty.
  bool Open(const std::string &table_rxfilename,
            const std::string &utt2spk_rxfilename);

  bool HasKey(const std::string &key);
  const T &Value(const std::string &key);

  // IsOpen() and Close() simply forward to the table reader (reader_); they
  // do not consult or close the map reader (token_reader_).
  bool IsOpen() const { return reader_.IsOpen(); }
  bool Close() { return reader_.Close(); }

 private:
  // Disallow assignment.
  RandomAccessTableReaderMapped &operator=(const RandomAccessTableReaderMapped<Holder>&);

  // Reader for the table itself.
  RandomAccessTableReader<Holder> reader_;
  // Reader for the map -- presumably opened from utt2spk_rxfilename; confirm
  // against the definition of Open().
  RandomAccessTableReader<TokenHolder> token_reader_;
  // Used only in diagnostic messages.
  std::string utt2spk_rxfilename_;
};


/// @} end "addtogroup table_group"
} // end namespace kaldi

#include "kaldi-table-inl.h"

#endif  // KALDI_UTIL_KALDI_TABLE_H_