From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 14 Aug 2015 11:51:42 +0800 Subject: add implementation for kaldi io (by ymz) --- kaldi_io/src/kaldi/util/basic-filebuf.h | 1065 +++++++++++ kaldi_io/src/kaldi/util/common-utils.h | 31 + kaldi_io/src/kaldi/util/const-integer-set-inl.h | 88 + kaldi_io/src/kaldi/util/const-integer-set.h | 95 + kaldi_io/src/kaldi/util/edit-distance-inl.h | 189 ++ kaldi_io/src/kaldi/util/edit-distance.h | 63 + kaldi_io/src/kaldi/util/hash-list-inl.h | 183 ++ kaldi_io/src/kaldi/util/hash-list.h | 140 ++ kaldi_io/src/kaldi/util/kaldi-holder-inl.h | 800 ++++++++ kaldi_io/src/kaldi/util/kaldi-holder.h | 207 +++ kaldi_io/src/kaldi/util/kaldi-io-inl.h | 45 + kaldi_io/src/kaldi/util/kaldi-io.h | 264 +++ kaldi_io/src/kaldi/util/kaldi-pipebuf.h | 90 + kaldi_io/src/kaldi/util/kaldi-table-inl.h | 2246 +++++++++++++++++++++++ kaldi_io/src/kaldi/util/kaldi-table.h | 459 +++++ kaldi_io/src/kaldi/util/parse-options.h | 264 +++ kaldi_io/src/kaldi/util/simple-io-funcs.h | 56 + kaldi_io/src/kaldi/util/simple-options.h | 112 ++ kaldi_io/src/kaldi/util/stl-utils.h | 327 ++++ kaldi_io/src/kaldi/util/table-types.h | 137 ++ kaldi_io/src/kaldi/util/text-utils.h | 169 ++ kaldi_io/src/kaldi/util/timer.h | 27 + 22 files changed, 7057 insertions(+) create mode 100644 kaldi_io/src/kaldi/util/basic-filebuf.h create mode 100644 kaldi_io/src/kaldi/util/common-utils.h create mode 100644 kaldi_io/src/kaldi/util/const-integer-set-inl.h create mode 100644 kaldi_io/src/kaldi/util/const-integer-set.h create mode 100644 kaldi_io/src/kaldi/util/edit-distance-inl.h create mode 100644 kaldi_io/src/kaldi/util/edit-distance.h create mode 100644 kaldi_io/src/kaldi/util/hash-list-inl.h create mode 100644 kaldi_io/src/kaldi/util/hash-list.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-holder-inl.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-holder.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-io-inl.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-io.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-pipebuf.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-table-inl.h create mode 100644 kaldi_io/src/kaldi/util/kaldi-table.h create mode 100644 kaldi_io/src/kaldi/util/parse-options.h create mode 100644 kaldi_io/src/kaldi/util/simple-io-funcs.h create mode 100644 kaldi_io/src/kaldi/util/simple-options.h create mode 100644 kaldi_io/src/kaldi/util/stl-utils.h create mode 100644 kaldi_io/src/kaldi/util/table-types.h create mode 100644 kaldi_io/src/kaldi/util/text-utils.h create mode 100644 kaldi_io/src/kaldi/util/timer.h (limited to 'kaldi_io/src/kaldi/util') diff --git a/kaldi_io/src/kaldi/util/basic-filebuf.h b/kaldi_io/src/kaldi/util/basic-filebuf.h new file mode 100644 index 0000000..cf2e079 --- /dev/null +++ b/kaldi_io/src/kaldi/util/basic-filebuf.h @@ -0,0 +1,1065 @@ +/////////////////////////////////////////////////////////////////////////////// +// This is a modified version of the std::basic_filebuf from libc++ +// (http://libcxx.llvm.org/). +// It allows one to create basic_filebuf from an existing FILE* handle or file +// descriptor. +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source License licenses. See LICENSE.TXT for details (included at the +// bottom). +/////////////////////////////////////////////////////////////////////////////// +#ifndef KALDI_UTIL_BASIC_FILEBUF_H_ +#define KALDI_UTIL_BASIC_FILEBUF_H_ + +/////////////////////////////////////////////////////////////////////////////// +#include +#include +#include + +/////////////////////////////////////////////////////////////////////////////// +namespace kaldi +{ + +/////////////////////////////////////////////////////////////////////////////// +template > +class basic_filebuf : public std::basic_streambuf +{ +public: + typedef CharT char_type; + typedef Traits traits_type; + typedef typename traits_type::int_type int_type; + typedef typename traits_type::pos_type pos_type; + typedef typename traits_type::off_type off_type; + typedef typename traits_type::state_type state_type; + + basic_filebuf(); + basic_filebuf(basic_filebuf&& rhs); + virtual ~basic_filebuf(); + + basic_filebuf& operator=(basic_filebuf&& rhs); + void swap(basic_filebuf& rhs); + + bool is_open() const; + basic_filebuf* open(const char* s, std::ios_base::openmode mode); + basic_filebuf* open(const std::string& s, std::ios_base::openmode mode); + basic_filebuf* open(int fd, std::ios_base::openmode mode); + basic_filebuf* open(FILE* f, std::ios_base::openmode mode); + basic_filebuf* close(); + + FILE* file() { return this->_M_file; } + int fd() { return fileno(this->_M_file); } + +protected: + int_type underflow() override; + int_type pbackfail(int_type c = traits_type::eof()) override; + int_type overflow (int_type c = traits_type::eof()) override; + std::basic_streambuf* setbuf(char_type* s, std::streamsize n) override; + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode wch = std::ios_base::in | std::ios_base::out) override; + pos_type seekpos(pos_type sp, + std::ios_base::openmode wch = std::ios_base::in | std::ios_base::out) override; + int sync() override; + void imbue(const std::locale& loc) override; + +protected: + char* _M_extbuf; + const char* _M_extbufnext; + const char* _M_extbufend; + char _M_extbuf_min[8]; + size_t _M_ebs; + char_type* _M_intbuf; + size_t _M_ibs; + FILE* _M_file; + const std::codecvt* _M_cv; + state_type _M_st; + state_type _M_st_last; + std::ios_base::openmode _M_om; + std::ios_base::openmode _M_cm; + bool _M_owns_eb; + bool _M_owns_ib; + bool _M_always_noconv; + + const char* _M_get_mode(std::ios_base::openmode mode); + bool _M_read_mode(); + void _M_write_mode(); +}; + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::basic_filebuf() + : _M_extbuf(nullptr), + _M_extbufnext(nullptr), + _M_extbufend(nullptr), + _M_ebs(0), + _M_intbuf(nullptr), + _M_ibs(0), + _M_file(nullptr), + _M_cv(nullptr), + _M_st(), + _M_st_last(), + _M_om(std::ios_base::openmode(0)), + _M_cm(std::ios_base::openmode(0)), + _M_owns_eb(false), + _M_owns_ib(false), + _M_always_noconv(false) +{ + if (std::has_facet >(this->getloc())) + { + _M_cv = &std::use_facet >(this->getloc()); + _M_always_noconv = _M_cv->always_noconv(); + } + setbuf(0, 4096); +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::basic_filebuf(basic_filebuf&& rhs) + : std::basic_streambuf(rhs) +{ + if (rhs._M_extbuf == rhs._M_extbuf_min) + { + _M_extbuf = _M_extbuf_min; + _M_extbufnext = _M_extbuf + (rhs._M_extbufnext - rhs._M_extbuf); + _M_extbufend = _M_extbuf + (rhs._M_extbufend - rhs._M_extbuf); + } + else + { + _M_extbuf = rhs._M_extbuf; + _M_extbufnext = rhs._M_extbufnext; + _M_extbufend = rhs._M_extbufend; + } + _M_ebs = rhs._M_ebs; + _M_intbuf = rhs._M_intbuf; + _M_ibs = rhs._M_ibs; + _M_file = rhs._M_file; + _M_cv = rhs._M_cv; + _M_st = rhs._M_st; + _M_st_last = rhs._M_st_last; + _M_om = rhs._M_om; + _M_cm = rhs._M_cm; + _M_owns_eb = rhs._M_owns_eb; + _M_owns_ib = rhs._M_owns_ib; + _M_always_noconv = rhs._M_always_noconv; + if (rhs.pbase()) + { + if (rhs.pbase() == rhs._M_intbuf) + this->setp(_M_intbuf, _M_intbuf + (rhs. epptr() - rhs.pbase())); + else + this->setp((char_type*)_M_extbuf, + (char_type*)_M_extbuf + (rhs. epptr() - rhs.pbase())); + this->pbump(rhs. pptr() - rhs.pbase()); + } + else if (rhs.eback()) + { + if (rhs.eback() == rhs._M_intbuf) + this->setg(_M_intbuf, _M_intbuf + (rhs.gptr() - rhs.eback()), + _M_intbuf + (rhs.egptr() - rhs.eback())); + else + this->setg((char_type*)_M_extbuf, + (char_type*)_M_extbuf + (rhs.gptr() - rhs.eback()), + (char_type*)_M_extbuf + (rhs.egptr() - rhs.eback())); + } + rhs._M_extbuf = nullptr; + rhs._M_extbufnext = nullptr; + rhs._M_extbufend = nullptr; + rhs._M_ebs = 0; + rhs._M_intbuf = nullptr; + rhs._M_ibs = 0; + rhs._M_file = nullptr; + rhs._M_st = state_type(); + rhs._M_st_last = state_type(); + rhs._M_om = std::ios_base::openmode(0); + rhs._M_cm = std::ios_base::openmode(0); + rhs._M_owns_eb = false; + rhs._M_owns_ib = false; + rhs.setg(0, 0, 0); + rhs.setp(0, 0); +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +basic_filebuf& +basic_filebuf::operator=(basic_filebuf&& rhs) +{ + close(); + swap(rhs); + return *this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf::~basic_filebuf() +{ + // try + // { + // close(); + // } + // catch (...) + // { + // } + if (_M_owns_eb) + delete [] _M_extbuf; + if (_M_owns_ib) + delete [] _M_intbuf; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::swap(basic_filebuf& rhs) +{ + std::basic_streambuf::swap(rhs); + if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) + { + std::swap(_M_extbuf, rhs._M_extbuf); + std::swap(_M_extbufnext, rhs._M_extbufnext); + std::swap(_M_extbufend, rhs._M_extbufend); + } + else + { + ptrdiff_t ln = _M_extbufnext - _M_extbuf; + ptrdiff_t le = _M_extbufend - _M_extbuf; + ptrdiff_t rn = rhs._M_extbufnext - rhs._M_extbuf; + ptrdiff_t re = rhs._M_extbufend - rhs._M_extbuf; + if (_M_extbuf == _M_extbuf_min && rhs._M_extbuf != rhs._M_extbuf_min) + { + _M_extbuf = rhs._M_extbuf; + rhs._M_extbuf = rhs._M_extbuf_min; + } + else if (_M_extbuf != _M_extbuf_min && rhs._M_extbuf == rhs._M_extbuf_min) + { + rhs._M_extbuf = _M_extbuf; + _M_extbuf = _M_extbuf_min; + } + _M_extbufnext = _M_extbuf + rn; + _M_extbufend = _M_extbuf + re; + rhs._M_extbufnext = rhs._M_extbuf + ln; + rhs._M_extbufend = rhs._M_extbuf + le; + } + std::swap(_M_ebs, rhs._M_ebs); + std::swap(_M_intbuf, rhs._M_intbuf); + std::swap(_M_ibs, rhs._M_ibs); + std::swap(_M_file, rhs._M_file); + std::swap(_M_cv, rhs._M_cv); + std::swap(_M_st, rhs._M_st); + std::swap(_M_st_last, rhs._M_st_last); + std::swap(_M_om, rhs._M_om); + std::swap(_M_cm, rhs._M_cm); + std::swap(_M_owns_eb, rhs._M_owns_eb); + std::swap(_M_owns_ib, rhs._M_owns_ib); + std::swap(_M_always_noconv, rhs._M_always_noconv); + if (this->eback() == (char_type*)rhs._M_extbuf_min) + { + ptrdiff_t n = this->gptr() - this->eback(); + ptrdiff_t e = this->egptr() - this->eback(); + this->setg((char_type*)_M_extbuf_min, + (char_type*)_M_extbuf_min + n, + (char_type*)_M_extbuf_min + e); + } + else if (this->pbase() == (char_type*)rhs._M_extbuf_min) + { + ptrdiff_t n = this->pptr() - this->pbase(); + ptrdiff_t e = this->epptr() - this->pbase(); + this->setp((char_type*)_M_extbuf_min, + (char_type*)_M_extbuf_min + e); + this->pbump(n); + } + if (rhs.eback() == (char_type*)_M_extbuf_min) + { + ptrdiff_t n = rhs.gptr() - rhs.eback(); + ptrdiff_t e = rhs.egptr() - rhs.eback(); + rhs.setg((char_type*)rhs._M_extbuf_min, + (char_type*)rhs._M_extbuf_min + n, + (char_type*)rhs._M_extbuf_min + e); + } + else if (rhs.pbase() == (char_type*)_M_extbuf_min) + { + ptrdiff_t n = rhs.pptr() - rhs.pbase(); + ptrdiff_t e = rhs.epptr() - rhs.pbase(); + rhs.setp((char_type*)rhs._M_extbuf_min, + (char_type*)rhs._M_extbuf_min + e); + rhs.pbump(n); + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +void +swap(basic_filebuf& x, basic_filebuf& y) +{ + x.swap(y); +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +bool +basic_filebuf::is_open() const +{ + return _M_file != nullptr; +} + +/////////////////////////////////////////////////////////////////////////////// +template +const char* basic_filebuf::_M_get_mode(std::ios_base::openmode mode) +{ + switch ((mode & ~std::ios_base::ate) | 0) + { + case std::ios_base::out: + case std::ios_base::out | std::ios_base::trunc: + return "w"; + case std::ios_base::out | std::ios_base::app: + case std::ios_base::app: + return "a"; + break; + case std::ios_base::in: + return "r"; + case std::ios_base::in | std::ios_base::out: + return "r+"; + case std::ios_base::in | std::ios_base::out | std::ios_base::trunc: + return "w+"; + case std::ios_base::in | std::ios_base::out | std::ios_base::app: + case std::ios_base::in | std::ios_base::app: + return "a+"; + case std::ios_base::out | std::ios_base::binary: + case std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: + return "wb"; + case std::ios_base::out | std::ios_base::app | std::ios_base::binary: + case std::ios_base::app | std::ios_base::binary: + return "ab"; + case std::ios_base::in | std::ios_base::binary: + return "rb"; + case std::ios_base::in | std::ios_base::out | std::ios_base::binary: + return "r+b"; + case std::ios_base::in | std::ios_base::out | std::ios_base::trunc | std::ios_base::binary: + return "w+b"; + case std::ios_base::in | std::ios_base::out | std::ios_base::app | std::ios_base::binary: + case std::ios_base::in | std::ios_base::app | std::ios_base::binary: + return "a+b"; + default: + return nullptr; + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::open(const char* s, std::ios_base::openmode mode) +{ + basic_filebuf* rt = nullptr; + if (_M_file == nullptr) + { + const char* md= _M_get_mode(mode); + if (md) + { + _M_file = fopen(s, md); + if (_M_file) + { + rt = this; + _M_om = mode; + if (mode & std::ios_base::ate) + { + if (fseek(_M_file, 0, SEEK_END)) + { + fclose(_M_file); + _M_file = nullptr; + rt = nullptr; + } + } + } + } + } + return rt; +} + +/////////////////////////////////////////////////////////////////////////////// +template +inline +basic_filebuf* +basic_filebuf::open(const std::string& s, std::ios_base::openmode mode) +{ + return open(s.c_str(), mode); +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::open(int fd, std::ios_base::openmode mode) +{ + const char* md= this->_M_get_mode(mode); + if (md) + { + this->_M_file= fdopen(fd, md); + this->_M_om = mode; + return this; + } + else return nullptr; +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::open(FILE* f, std::ios_base::openmode mode) +{ + this->_M_file = f; + this->_M_om = mode; + return this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +basic_filebuf* +basic_filebuf::close() +{ + basic_filebuf* rt = nullptr; + if (_M_file) + { + rt = this; + std::unique_ptr h(_M_file, fclose); + if (sync()) + rt = nullptr; + if (fclose(h.release()) == 0) + _M_file = nullptr; + else + rt = nullptr; + } + return rt; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::underflow() +{ + if (_M_file == nullptr) + return traits_type::eof(); + bool initial = _M_read_mode(); + char_type buf; + if (this->gptr() == nullptr) + this->setg(&buf, &buf+1, &buf+1); + const size_t unget_sz = initial ? 0 : std::min((this->egptr() - this->eback()) / 2, 4); + int_type c = traits_type::eof(); + if (this->gptr() == this->egptr()) + { + memmove(this->eback(), this->egptr() - unget_sz, unget_sz * sizeof(char_type)); + if (_M_always_noconv) + { + size_t nmemb = static_cast(this->egptr() - this->eback() - unget_sz); + nmemb = fread(this->eback() + unget_sz, 1, nmemb, _M_file); + if (nmemb != 0) + { + this->setg(this->eback(), + this->eback() + unget_sz, + this->eback() + unget_sz + nmemb); + c = traits_type::to_int_type(*this->gptr()); + } + } + else + { + memmove(_M_extbuf, _M_extbufnext, _M_extbufend - _M_extbufnext); + _M_extbufnext = _M_extbuf + (_M_extbufend - _M_extbufnext); + _M_extbufend = _M_extbuf + (_M_extbuf == _M_extbuf_min ? sizeof(_M_extbuf_min) : _M_ebs); + size_t nmemb = std::min(static_cast(_M_ibs - unget_sz), + static_cast(_M_extbufend - _M_extbufnext)); + std::codecvt_base::result r; + _M_st_last = _M_st; + size_t nr = fread((void*)_M_extbufnext, 1, nmemb, _M_file); + if (nr != 0) + { + if (!_M_cv) + throw std::bad_cast(); + _M_extbufend = _M_extbufnext + nr; + char_type* inext; + r = _M_cv->in(_M_st, _M_extbuf, _M_extbufend, _M_extbufnext, + this->eback() + unget_sz, + this->eback() + _M_ibs, inext); + if (r == std::codecvt_base::noconv) + { + this->setg((char_type*)_M_extbuf, (char_type*)_M_extbuf, (char_type*)_M_extbufend); + c = traits_type::to_int_type(*this->gptr()); + } + else if (inext != this->eback() + unget_sz) + { + this->setg(this->eback(), this->eback() + unget_sz, inext); + c = traits_type::to_int_type(*this->gptr()); + } + } + } + } + else + c = traits_type::to_int_type(*this->gptr()); + if (this->eback() == &buf) + this->setg(0, 0, 0); + return c; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::pbackfail(int_type c) +{ + if (_M_file && this->eback() < this->gptr()) + { + if (traits_type::eq_int_type(c, traits_type::eof())) + { + this->gbump(-1); + return traits_type::not_eof(c); + } + if ((_M_om & std::ios_base::out) || + traits_type::eq(traits_type::to_char_type(c), this->gptr()[-1])) + { + this->gbump(-1); + *this->gptr() = traits_type::to_char_type(c); + return c; + } + } + return traits_type::eof(); +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::int_type +basic_filebuf::overflow(int_type c) +{ + if (_M_file == nullptr) + return traits_type::eof(); + _M_write_mode(); + char_type buf; + char_type* pb_save = this->pbase(); + char_type* epb_save = this->epptr(); + if (!traits_type::eq_int_type(c, traits_type::eof())) + { + if (this->pptr() == nullptr) + this->setp(&buf, &buf+1); + *this->pptr() = traits_type::to_char_type(c); + this->pbump(1); + } + if (this->pptr() != this->pbase()) + { + if (_M_always_noconv) + { + size_t nmemb = static_cast(this->pptr() - this->pbase()); + if (fwrite(this->pbase(), sizeof(char_type), nmemb, _M_file) != nmemb) + return traits_type::eof(); + } + else + { + char* extbe = _M_extbuf; + std::codecvt_base::result r; + do + { + if (!_M_cv) + throw std::bad_cast(); + const char_type* e; + r = _M_cv->out(_M_st, this->pbase(), this->pptr(), e, + _M_extbuf, _M_extbuf + _M_ebs, extbe); + if (e == this->pbase()) + return traits_type::eof(); + if (r == std::codecvt_base::noconv) + { + size_t nmemb = static_cast(this->pptr() - this->pbase()); + if (fwrite(this->pbase(), 1, nmemb, _M_file) != nmemb) + return traits_type::eof(); + } + else if (r == std::codecvt_base::ok || r == std::codecvt_base::partial) + { + size_t nmemb = static_cast(extbe - _M_extbuf); + if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) + return traits_type::eof(); + if (r == std::codecvt_base::partial) + { + this->setp((char_type*)e, this->pptr()); + this->pbump(this->epptr() - this->pbase()); + } + } + else + return traits_type::eof(); + } while (r == std::codecvt_base::partial); + } + this->setp(pb_save, epb_save); + } + return traits_type::not_eof(c); +} + +/////////////////////////////////////////////////////////////////////////////// +template +std::basic_streambuf* +basic_filebuf::setbuf(char_type* s, std::streamsize n) +{ + this->setg(0, 0, 0); + this->setp(0, 0); + if (_M_owns_eb) + delete [] _M_extbuf; + if (_M_owns_ib) + delete [] _M_intbuf; + _M_ebs = n; + if (_M_ebs > sizeof(_M_extbuf_min)) + { + if (_M_always_noconv && s) + { + _M_extbuf = (char*)s; + _M_owns_eb = false; + } + else + { + _M_extbuf = new char[_M_ebs]; + _M_owns_eb = true; + } + } + else + { + _M_extbuf = _M_extbuf_min; + _M_ebs = sizeof(_M_extbuf_min); + _M_owns_eb = false; + } + if (!_M_always_noconv) + { + _M_ibs = std::max(n, sizeof(_M_extbuf_min)); + if (s && _M_ibs >= sizeof(_M_extbuf_min)) + { + _M_intbuf = s; + _M_owns_ib = false; + } + else + { + _M_intbuf = new char_type[_M_ibs]; + _M_owns_ib = true; + } + } + else + { + _M_ibs = 0; + _M_intbuf = 0; + _M_owns_ib = false; + } + return this; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::pos_type +basic_filebuf::seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode) +{ + if (!_M_cv) + throw std::bad_cast(); + int width = _M_cv->encoding(); + if (_M_file == nullptr || (width <= 0 && off != 0) || sync()) + return pos_type(off_type(-1)); + // width > 0 || off == 0 + int whence; + switch (way) + { + case std::ios_base::beg: + whence = SEEK_SET; + break; + case std::ios_base::cur: + whence = SEEK_CUR; + break; + case std::ios_base::end: + whence = SEEK_END; + break; + default: + return pos_type(off_type(-1)); + } +#if _WIN32 + if (fseek(_M_file, width > 0 ? width * off : 0, whence)) + return pos_type(off_type(-1)); + pos_type r = ftell(_M_file); +#else + if (fseeko(_M_file, width > 0 ? width * off : 0, whence)) + return pos_type(off_type(-1)); + pos_type r = ftello(_M_file); +#endif + r.state(_M_st); + return r; +} + +/////////////////////////////////////////////////////////////////////////////// +template +typename basic_filebuf::pos_type +basic_filebuf::seekpos(pos_type sp, std::ios_base::openmode) +{ + if (_M_file == nullptr || sync()) + return pos_type(off_type(-1)); +#if _WIN32 + if (fseek(_M_file, sp, SEEK_SET)) + return pos_type(off_type(-1)); +#else + if (fseeko(_M_file, sp, SEEK_SET)) + return pos_type(off_type(-1)); +#endif + _M_st = sp.state(); + return sp; +} + +/////////////////////////////////////////////////////////////////////////////// +template +int +basic_filebuf::sync() +{ + if (_M_file == nullptr) + return 0; + if (!_M_cv) + throw std::bad_cast(); + if (_M_cm & std::ios_base::out) + { + if (this->pptr() != this->pbase()) + if (overflow() == traits_type::eof()) + return -1; + std::codecvt_base::result r; + do + { + char* extbe; + r = _M_cv->unshift(_M_st, _M_extbuf, _M_extbuf + _M_ebs, extbe); + size_t nmemb = static_cast(extbe - _M_extbuf); + if (fwrite(_M_extbuf, 1, nmemb, _M_file) != nmemb) + return -1; + } while (r == std::codecvt_base::partial); + if (r == std::codecvt_base::error) + return -1; + if (fflush(_M_file)) + return -1; + } + else if (_M_cm & std::ios_base::in) + { + off_type c; + state_type state = _M_st_last; + bool update_st = false; + if (_M_always_noconv) + c = this->egptr() - this->gptr(); + else + { + int width = _M_cv->encoding(); + c = _M_extbufend - _M_extbufnext; + if (width > 0) + c += width * (this->egptr() - this->gptr()); + else + { + if (this->gptr() != this->egptr()) + { + const int off = _M_cv->length(state, _M_extbuf, + _M_extbufnext, + this->gptr() - this->eback()); + c += _M_extbufnext - _M_extbuf - off; + update_st = true; + } + } + } +#if _WIN32 + if (fseek(_M_file_, -c, SEEK_CUR)) + return -1; +#else + if (fseeko(_M_file, -c, SEEK_CUR)) + return -1; +#endif + if (update_st) + _M_st = state; + _M_extbufnext = _M_extbufend = _M_extbuf; + this->setg(0, 0, 0); + _M_cm = std::ios_base::openmode(0); + } + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::imbue(const std::locale& loc) +{ + sync(); + _M_cv = &std::use_facet >(loc); + bool old_anc = _M_always_noconv; + _M_always_noconv = _M_cv->always_noconv(); + if (old_anc != _M_always_noconv) + { + this->setg(0, 0, 0); + this->setp(0, 0); + // invariant, char_type is char, else we couldn't get here + if (_M_always_noconv) // need to dump _M_intbuf + { + if (_M_owns_eb) + delete [] _M_extbuf; + _M_owns_eb = _M_owns_ib; + _M_ebs = _M_ibs; + _M_extbuf = (char*)_M_intbuf; + _M_ibs = 0; + _M_intbuf = nullptr; + _M_owns_ib = false; + } + else // need to obtain an _M_intbuf. + { // If _M_extbuf is user-supplied, use it, else new _M_intbuf + if (!_M_owns_eb && _M_extbuf != _M_extbuf_min) + { + _M_ibs = _M_ebs; + _M_intbuf = (char_type*)_M_extbuf; + _M_owns_ib = false; + _M_extbuf = new char[_M_ebs]; + _M_owns_eb = true; + } + else + { + _M_ibs = _M_ebs; + _M_intbuf = new char_type[_M_ibs]; + _M_owns_ib = true; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +template +bool +basic_filebuf::_M_read_mode() +{ + if (!(_M_cm & std::ios_base::in)) + { + this->setp(0, 0); + if (_M_always_noconv) + this->setg((char_type*)_M_extbuf, + (char_type*)_M_extbuf + _M_ebs, + (char_type*)_M_extbuf + _M_ebs); + else + this->setg(_M_intbuf, _M_intbuf + _M_ibs, _M_intbuf + _M_ibs); + _M_cm = std::ios_base::in; + return true; + } + return false; +} + +/////////////////////////////////////////////////////////////////////////////// +template +void +basic_filebuf::_M_write_mode() +{ + if (!(_M_cm & std::ios_base::out)) + { + this->setg(0, 0, 0); + if (_M_ebs > sizeof(_M_extbuf_min)) + { + if (_M_always_noconv) + this->setp((char_type*)_M_extbuf, + (char_type*)_M_extbuf + (_M_ebs - 1)); + else + this->setp(_M_intbuf, _M_intbuf + (_M_ibs - 1)); + } + else + this->setp(0, 0); + _M_cm = std::ios_base::out; + } +} + +/////////////////////////////////////////////////////////////////////////////// +} + +/////////////////////////////////////////////////////////////////////////////// +#endif // KALDI_UTIL_BASIC_FILEBUF_H_ + +/////////////////////////////////////////////////////////////////////////////// + +/* + * ============================================================================ + * libc++ License + * ============================================================================ + * + * The libc++ library is dual licensed under both the University of Illinois + * "BSD-Like" license and the MIT license. As a user of this code you may + * choose to use it under either license. As a contributor, you agree to allow + * your code to be used under both. + * + * Full text of the relevant licenses is included below. + * + * ============================================================================ + * + * University of Illinois/NCSA + * Open Source License + * + * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included below) + * + * All rights reserved. + * + * Developed by: + * + * LLVM Team + * + * University of Illinois at Urbana-Champaign + * + * http://llvm.org + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal with + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimers in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the names of the LLVM Team, University of Illinois at + * Urbana-Champaign, nor the names of its contributors may be used to + * endorse or promote products derived from this Software without specific + * prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + * SOFTWARE. + * + * ============================================================================== + * + * Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT (included below) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * ============================================================================== + * + * This file is a partial list of people who have contributed to the LLVM/libc++ + * project. If you have contributed a patch or made some other contribution to + * LLVM/libc++, please submit a patch to this file to add yourself, and it will be + * done! + * + * The list is sorted by surname and formatted to allow easy grepping and + * beautification by scripts. The fields are: name (N), email (E), web-address + * (W), PGP key ID and fingerprint (P), description (D), and snail-mail address + * (S). + * + * N: Saleem Abdulrasool + * E: compnerd@compnerd.org + * D: Minor patches and Linux fixes. + * + * N: Dimitry Andric + * E: dimitry@andric.com + * D: Visibility fixes, minor FreeBSD portability patches. + * + * N: Holger Arnold + * E: holgerar@gmail.com + * D: Minor fix. + * + * N: Ruben Van Boxem + * E: vanboxem dot ruben at gmail dot com + * D: Initial Windows patches. + * + * N: David Chisnall + * E: theraven at theravensnest dot org + * D: FreeBSD and Solaris ports, libcxxrt support, some atomics work. + * + * N: Marshall Clow + * E: mclow.lists@gmail.com + * E: marshall@idio.com + * D: C++14 support, patches and bug fixes. + * + * N: Bill Fisher + * E: william.w.fisher@gmail.com + * D: Regex bug fixes. + * + * N: Matthew Dempsky + * E: matthew@dempsky.org + * D: Minor patches and bug fixes. + * + * N: Google Inc. + * D: Copyright owner and contributor of the CityHash algorithm + * + * N: Howard Hinnant + * E: hhinnant@apple.com + * D: Architect and primary author of libc++ + * + * N: Hyeon-bin Jeong + * E: tuhertz@gmail.com + * D: Minor patches and bug fixes. + * + * N: Argyrios Kyrtzidis + * E: kyrtzidis@apple.com + * D: Bug fixes. + * + * N: Bruce Mitchener, Jr. + * E: bruce.mitchener@gmail.com + * D: Emscripten-related changes. + * + * N: Michel Morin + * E: mimomorin@gmail.com + * D: Minor patches to is_convertible. + * + * N: Andrew Morrow + * E: andrew.c.morrow@gmail.com + * D: Minor patches and Linux fixes. + * + * N: Arvid Picciani + * E: aep at exys dot org + * D: Minor patches and musl port. + * + * N: Bjorn Reese + * E: breese@users.sourceforge.net + * D: Initial regex prototype + * + * N: Nico Rieck + * E: nico.rieck@gmail.com + * D: Windows fixes + * + * N: Jonathan Sauer + * D: Minor patches, mostly related to constexpr + * + * N: Craig Silverstein + * E: csilvers@google.com + * D: Implemented Cityhash as the string hash function on 64-bit machines + * + * N: Richard Smith + * D: Minor patches. + * + * N: Joerg Sonnenberger + * E: joerg@NetBSD.org + * D: NetBSD port. + * + * N: Stephan Tolksdorf + * E: st@quanttec.com + * D: Minor fix + * + * N: Michael van der Westhuizen + * E: r1mikey at gmail dot com + * + * N: Klaas de Vries + * E: klaas at klaasgaaf dot nl + * D: Minor bug fix. + * + * N: Zhang Xiongpang + * E: zhangxiongpang@gmail.com + * D: Minor patches and bug fixes. + * + * N: Xing Xue + * E: xingxue@ca.ibm.com + * D: AIX port + * + * N: Zhihao Yuan + * E: lichray@gmail.com + * D: Standard compatibility fixes. + * + * N: Jeffrey Yasskin + * E: jyasskin@gmail.com + * E: jyasskin@google.com + * D: Linux fixes. + */ diff --git a/kaldi_io/src/kaldi/util/common-utils.h b/kaldi_io/src/kaldi/util/common-utils.h new file mode 100644 index 0000000..9d39f9d --- /dev/null +++ b/kaldi_io/src/kaldi/util/common-utils.h @@ -0,0 +1,31 @@ +// util/common-utils.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_COMMON_UTILS_H_ +#define KALDI_UTIL_COMMON_UTILS_H_ + +#include "base/kaldi-common.h" +#include "util/parse-options.h" +#include "util/kaldi-io.h" +#include "util/simple-io-funcs.h" +#include "util/kaldi-holder.h" +#include "util/kaldi-table.h" +#include "util/table-types.h" +#include "util/text-utils.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/const-integer-set-inl.h b/kaldi_io/src/kaldi/util/const-integer-set-inl.h new file mode 100644 index 0000000..8f92ab2 --- /dev/null +++ b/kaldi_io/src/kaldi/util/const-integer-set-inl.h @@ -0,0 +1,88 @@ +// util/const-integer-set-inl.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_CONST_INTEGER_SET_INL_H_ +#define KALDI_UTIL_CONST_INTEGER_SET_INL_H_ + +// Do not include this file directly. It is included by const-integer-set.h + + +namespace kaldi { + +template +void ConstIntegerSet::InitInternal() { + KALDI_ASSERT_IS_INTEGER_TYPE(I); + quick_set_.clear(); // just in case we previously had data. + if (slow_set_.size() == 0) { + lowest_member_=(I) 1; + highest_member_=(I) 0; + contiguous_ = false; + quick_ = false; + } else { + lowest_member_ = slow_set_.front(); + highest_member_ = slow_set_.back(); + size_t range = highest_member_ + 1 - lowest_member_; + if (range == slow_set_.size()) { + contiguous_ = true; + quick_=false; + } else { + contiguous_ = false; + if (range < slow_set_.size() * 8 * sizeof(I)) { // If it would be more compact to store as bool + // (assuming 1 bit per element)... + quick_set_.resize(range, false); + for (size_t i = 0;i < slow_set_.size();i++) + quick_set_[slow_set_[i] - lowest_member_] = true; + quick_ = true; + } else { + quick_ = false; + } + } + } +} + +template +int ConstIntegerSet::count(I i) const { + if (i < lowest_member_ || i > highest_member_) return 0; + else { + if (contiguous_) return true; + if (quick_) return (quick_set_[i-lowest_member_] ? 1 : 0); + else { + bool ans = std::binary_search(slow_set_.begin(), slow_set_.end(), i); + return (ans ? 1 : 0); + } + } +} + +template +void ConstIntegerSet::Write(std::ostream &os, bool binary) const { + WriteIntegerVector(os, binary, slow_set_); +} + +template +void ConstIntegerSet::Read(std::istream &is, bool binary) { + ReadIntegerVector(is, binary, &slow_set_); + InitInternal(); +} + + + +} // end namespace kaldi + +#endif diff --git a/kaldi_io/src/kaldi/util/const-integer-set.h b/kaldi_io/src/kaldi/util/const-integer-set.h new file mode 100644 index 0000000..ffdce4d --- /dev/null +++ b/kaldi_io/src/kaldi/util/const-integer-set.h @@ -0,0 +1,95 @@ +// util/const-integer-set.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_CONST_INTEGER_SET_H_ +#define KALDI_UTIL_CONST_INTEGER_SET_H_ +#include +#include +#include +#include +#include +#include "util/stl-utils.h" + + /* ConstIntegerSet is a way to efficiently test whether something is in a + supplied set of integers. It can be initialized from a vector or set, but + never changed after that. It either uses a sorted vector or an array of + bool, depending on the input. It behaves like a const version of an STL set, with + only a subset of the functionality, except all the member functions are + upper-case. + + Note that we could get rid of the member slow_set_, but we'd have to + do more work to implement an iterator type. This would save memory. + */ + +namespace kaldi { + +template class ConstIntegerSet { + public: + ConstIntegerSet(): lowest_member_(1), highest_member_(0) { } + + void Init(const std::vector &input) { + slow_set_ = input; + SortAndUniq(&slow_set_); + InitInternal(); + } + + void Init(const std::set &input) { + CopySetToVector(input, &slow_set_); + InitInternal(); + } + + explicit ConstIntegerSet(const std::vector &input): slow_set_(input) { + SortAndUniq(&slow_set_); + InitInternal(); + } + explicit ConstIntegerSet(const std::set &input) { + CopySetToVector(input, &slow_set_); + InitInternal(); + } + explicit ConstIntegerSet(const ConstIntegerSet &other): slow_set_(other.slow_set_) { + InitInternal(); + } + + int count(I i) const; // returns 1 or 0. + + typedef typename std::vector::const_iterator iterator; + iterator begin() const { return slow_set_.begin(); } + iterator end() const { return slow_set_.end(); } + size_t size() const { return slow_set_.size(); } + bool empty() const { return slow_set_.empty(); } + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + private: + I lowest_member_; + I highest_member_; + bool contiguous_; + bool quick_; + std::vector quick_set_; + std::vector slow_set_; + void InitInternal(); +}; + +} // end namespace kaldi + +#include "const-integer-set-inl.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/edit-distance-inl.h b/kaldi_io/src/kaldi/util/edit-distance-inl.h new file mode 100644 index 0000000..ebbfb71 --- /dev/null +++ b/kaldi_io/src/kaldi/util/edit-distance-inl.h @@ -0,0 +1,189 @@ +// util/edit-distance-inl.h + +// Copyright 2009-2011 Microsoft Corporation; Haihua Xu; Yanmin Qian + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_EDIT_DISTANCE_INL_H_ +#define KALDI_UTIL_EDIT_DISTANCE_INL_H_ +#include "util/stl-utils.h" + + +namespace kaldi { + +template +int32 LevenshteinEditDistance(const std::vector &a, + const std::vector &b) { + // Algorithm: + // write A and B for the sequences, with elements a_0 .. + // let |A| = M and |B| = N be the lengths, and have + // elements a_0 ... a_{M-1} and b_0 ... b_{N-1}. + // We are computing the recursion + // E(m, n) = min( E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})), + // E(m-1, n), + // E(m, n-1) ). + // where E(m, n) is defined for m = 0..M and n = 0..N and out-of- + // bounds quantities are considered to be infinity (i.e. the + // recursion does not visit them). + + // We do this computation using a vector e of size N+1. + // The outer iterations range over m = 0..M. + + int M = a.size(), N = b.size(); + std::vector e(N+1); + std::vector e_tmp(N+1); + // initialize e. + for (size_t i = 0; i < e.size(); i++) + e[i] = i; + for (int32 m = 1; m <= M; m++) { + // computing E(m, .) from E(m-1, .) + // handle special case n = 0: + e_tmp[0] = e[0] + 1; + + for (int32 n = 1; n <= N; n++) { + int32 term1 = e[n-1] + (a[m-1] == b[n-1] ? 0 : 1); + int32 term2 = e[n] + 1; + int32 term3 = e_tmp[n-1] + 1; + e_tmp[n] = std::min(term1, std::min(term2, term3)); + } + e = e_tmp; + } + return e.back(); +} +// +struct error_stats{ + int32 ins_num; + int32 del_num; + int32 sub_num; + int32 total_cost; // minimum total cost to the current alignment. +}; +// Note that both hyp and ref should not contain noise word in +// the following implementation. + +template +int32 LevenshteinEditDistance(const std::vector &ref, + const std::vector &hyp, + int32 *ins, int32 *del, int32 *sub) { + // temp sequence to remember error type and stats. + std::vector e(ref.size()+1); + std::vector cur_e(ref.size()+1); + // initialize the first hypothesis aligned to the reference at each + // position:[hyp_index =0][ref_index] + for (size_t i =0; i < e.size(); i ++) { + e[i].ins_num = 0; + e[i].sub_num = 0; + e[i].del_num = i; + e[i].total_cost = i; + } + + // for other alignments + for (size_t hyp_index = 1; hyp_index <= hyp.size(); hyp_index ++) { + cur_e[0] = e[0]; + cur_e[0].ins_num ++; + cur_e[0].total_cost ++; + for (size_t ref_index = 1; ref_index <= ref.size(); ref_index ++) { + + int32 ins_err = e[ref_index].total_cost + 1; + int32 del_err = cur_e[ref_index-1].total_cost + 1; + int32 sub_err = e[ref_index-1].total_cost; + if (hyp[hyp_index-1] != ref[ref_index-1]) + sub_err ++; + + if (sub_err < ins_err && sub_err < del_err) { + cur_e[ref_index] =e[ref_index-1]; + if (hyp[hyp_index-1] != ref[ref_index-1]) + cur_e[ref_index].sub_num ++; // substitution error should be increased + cur_e[ref_index].total_cost = sub_err; + }else if (del_err < ins_err ) { + cur_e[ref_index] = cur_e[ref_index-1]; + cur_e[ref_index].total_cost = del_err; + cur_e[ref_index].del_num ++; // deletion number is increased. + }else{ + cur_e[ref_index] = e[ref_index]; + cur_e[ref_index].total_cost = ins_err; + cur_e[ref_index].ins_num ++; // insertion number is increased. + } + } + e = cur_e; // alternate for the next recursion. + } + size_t ref_index = e.size()-1; + *ins = e[ref_index].ins_num, *del = e[ref_index].del_num, *sub = e[ref_index].sub_num; + return e[ref_index].total_cost; +} + +template +int32 LevenshteinAlignment(const std::vector &a, + const std::vector &b, + T eps_symbol, + std::vector > *output) { + // Check inputs: + { + KALDI_ASSERT(output != NULL); + for (size_t i = 0; i < a.size(); i++) KALDI_ASSERT(a[i] != eps_symbol); + for (size_t i = 0; i < b.size(); i++) KALDI_ASSERT(b[i] != eps_symbol); + } + output->clear(); + // This is very memory-inefficiently implemented using a vector of vectors. + size_t M = a.size(), N = b.size(); + size_t m, n; + std::vector > e(M+1); + for (m = 0; m <=M; m++) e[m].resize(N+1); + for (n = 0; n <= N; n++) + e[0][n] = n; + for (m = 1; m <= M; m++) { + e[m][0] = e[m-1][0] + 1; + for (n = 1; n <= N; n++) { + int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1); + int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp. + int32 ins = e[m][n-1] + 1; + e[m][n] = std::min(sub_or_ok, std::min(del, ins)); + } + } + // get time-reversed output first: trace back. + m = M; n = N; + while (m != 0 || n != 0) { + size_t last_m, last_n; + if (m == 0) { last_m = m; last_n = n-1; } + else if (n == 0) { last_m = m-1; last_n = n; } + else { + int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1); + int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp. + int32 ins = e[m][n-1] + 1; + if (sub_or_ok <= std::min(del, ins)) { // choose sub_or_ok if all else equal. + last_m = m-1; last_n = n-1; + } else { + if (del <= ins) { // choose del over ins if equal. + last_m = m-1; last_n = n; + } else { + last_m = m; last_n = n-1; + } + } + } + T a_sym, b_sym; + a_sym = (last_m == m ? eps_symbol : a[last_m]); + b_sym = (last_n == n ? eps_symbol : b[last_n]); + output->push_back(std::make_pair(a_sym, b_sym)); + m = last_m; + n = last_n; + } + ReverseVector(output); + return e[M][N]; +} + + +} // end namespace kaldi + +#endif // KALDI_UTIL_EDIT_DISTANCE_INL_H_ diff --git a/kaldi_io/src/kaldi/util/edit-distance.h b/kaldi_io/src/kaldi/util/edit-distance.h new file mode 100644 index 0000000..6000622 --- /dev/null +++ b/kaldi_io/src/kaldi/util/edit-distance.h @@ -0,0 +1,63 @@ +// util/edit-distance.h + +// Copyright 2009-2011 Microsoft Corporation; Haihua Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_EDIT_DISTANCE_H_ +#define KALDI_UTIL_EDIT_DISTANCE_H_ +#include +#include +#include +#include +#include +#include "base/kaldi-types.h" + +namespace kaldi { + +// Compute the edit-distance between two strings. +template +int32 LevenshteinEditDistance(const std::vector &a, + const std::vector &b); + + +// edit distance calculation with conventional method. +// note: noise word must be filtered out from the hypothesis and reference sequence +// before the following procedure conducted. +template +int32 LevenshteinEditDistance(const std::vector &ref, + const std::vector &hyp, + int32 *ins, int32 *del, int32 *sub); + +// This version of the edit-distance computation outputs the alignment +// between the two. This is a vector of pairs of (symbol a, symbol b). +// The epsilon symbol (eps_symbol) must not occur in sequences a or b. +// Where one aligned to no symbol in the other (insertion or deletion), +// epsilon will be the corresponding member of the pair. +// It returns the edit-distance between the two strings. + +template +int32 LevenshteinAlignment(const std::vector &a, + const std::vector &b, + T eps_symbol, + std::vector > *output); + +} // end namespace kaldi + +#include "edit-distance-inl.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/hash-list-inl.h b/kaldi_io/src/kaldi/util/hash-list-inl.h new file mode 100644 index 0000000..19c2bb6 --- /dev/null +++ b/kaldi_io/src/kaldi/util/hash-list-inl.h @@ -0,0 +1,183 @@ +// util/hash-list-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_HASH_LIST_INL_H_ +#define KALDI_UTIL_HASH_LIST_INL_H_ + +// Do not include this file directly. It is included by fast-hash.h + + +namespace kaldi { + +template HashList::HashList() { + list_head_ = NULL; + bucket_list_tail_ = static_cast(-1); // invalid. + hash_size_ = 0; + freed_head_ = NULL; +} + +template void HashList::SetSize(size_t size) { + hash_size_ = size; + KALDI_ASSERT(list_head_ == NULL && bucket_list_tail_ == static_cast(-1)); // make sure empty. + if (size > buckets_.size()) + buckets_.resize(size, HashBucket(0, NULL)); +} + +template +typename HashList::Elem* HashList::Clear() { + // Clears the hashtable and gives ownership of the currently contained list to the + // user. + for (size_t cur_bucket = bucket_list_tail_; + cur_bucket != static_cast(-1); + cur_bucket = buckets_[cur_bucket].prev_bucket) { + buckets_[cur_bucket].last_elem = NULL; // this is how we indicate "empty". + } + bucket_list_tail_ = static_cast(-1); + Elem *ans = list_head_; + list_head_ = NULL; + return ans; +} + +template +const typename HashList::Elem* HashList::GetList() const { + return list_head_; +} + +template +inline void HashList::Delete(Elem *e) { + e->tail = freed_head_; + freed_head_ = e; +} + +template +inline typename HashList::Elem* HashList::Find(I key) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + if (bucket.last_elem == NULL) { + return NULL; // empty bucket. + } else { + Elem *head = (bucket.prev_bucket == static_cast(-1) ? + list_head_ : + buckets_[bucket.prev_bucket].last_elem->tail), + *tail = bucket.last_elem->tail; + for (Elem *e = head; e != tail; e = e->tail) + if (e->key == key) return e; + return NULL; // Not found. + } +} + +template +inline typename HashList::Elem* HashList::New() { + if (freed_head_) { + Elem *ans = freed_head_; + freed_head_ = freed_head_->tail; + return ans; + } else { + Elem *tmp = new Elem[allocate_block_size_]; + for (size_t i = 0; i+1 < allocate_block_size_; i++) + tmp[i].tail = tmp+i+1; + tmp[allocate_block_size_-1].tail = NULL; + freed_head_ = tmp; + allocated_.push_back(tmp); + return this->New(); + } +} + +template +HashList::~HashList() { + // First test whether we had any memory leak within the + // HashList, i.e. things for which the user did not call Delete(). + size_t num_in_list = 0, num_allocated = 0; + for (Elem *e = freed_head_; e != NULL; e = e->tail) + num_in_list++; + for (size_t i = 0; i < allocated_.size(); i++) { + num_allocated += allocate_block_size_; + delete[] allocated_[i]; + } + if (num_in_list != num_allocated) { + KALDI_WARN << "Possible memory leak: " << num_in_list + << " != " << num_allocated + << ": you might have forgotten to call Delete on " + << "some Elems"; + } +} + + +template +void HashList::Insert(I key, T val) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + Elem *elem = New(); + elem->key = key; + elem->val = val; + + if (bucket.last_elem == NULL) { // Unoccupied bucket. Insert at + // head of bucket list (which is tail of regular list, they go in + // opposite directions). + if (bucket_list_tail_ == static_cast(-1)) { + // list was empty so this is the first elem. + KALDI_ASSERT(list_head_ == NULL); + list_head_ = elem; + } else { + // link in to the chain of Elems + buckets_[bucket_list_tail_].last_elem->tail = elem; + } + elem->tail = NULL; + bucket.last_elem = elem; + bucket.prev_bucket = bucket_list_tail_; + bucket_list_tail_ = index; + } else { + // Already-occupied bucket. Insert at tail of list of elements within + // the bucket. + elem->tail = bucket.last_elem->tail; + bucket.last_elem->tail = elem; + bucket.last_elem = elem; + } +} + +template +void HashList::InsertMore(I key, T val) { + size_t index = (static_cast(key) % hash_size_); + HashBucket &bucket = buckets_[index]; + Elem *elem = New(); + elem->key = key; + elem->val = val; + + KALDI_ASSERT(bucket.last_elem != NULL); // we assume there is already one element + if (bucket.last_elem->key == key) { // standard behavior: add as last element + elem->tail = bucket.last_elem->tail; + bucket.last_elem->tail = elem; + bucket.last_elem = elem; + return; + } + Elem *e = (bucket.prev_bucket == static_cast(-1) ? + list_head_ : buckets_[bucket.prev_bucket].last_elem->tail); + // find place to insert in linked list + while (e != bucket.last_elem->tail && e->key != key) e = e->tail; + KALDI_ASSERT(e->key == key); // not found? - should not happen + elem->tail = e->tail; + e->tail = elem; +} + + +} // end namespace kaldi + +#endif diff --git a/kaldi_io/src/kaldi/util/hash-list.h b/kaldi_io/src/kaldi/util/hash-list.h new file mode 100644 index 0000000..4524759 --- /dev/null +++ b/kaldi_io/src/kaldi/util/hash-list.h @@ -0,0 +1,140 @@ +// util/hash-list.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_HASH_LIST_H_ +#define KALDI_UTIL_HASH_LIST_H_ +#include +#include +#include +#include +#include +#include "util/stl-utils.h" + + +/* This header provides utilities for a structure that's used in a decoder (but + is quite generic in nature so we implement and test it separately). + Basically it's a singly-linked list, but implemented in such a way that we + can quickly search for elements in the list. We give it a slightly richer + interface than just a hash and a list. The idea is that we want to separate + the hash part and the list part: basically, in the decoder, we want to have a + single hash for the current frame and the next frame, because by the time we + need to access the hash for the next frame we no longer need the hash for the + previous frame. So we have an operation that clears the hash but leaves the + list structure intact. We also control memory management inside this object, + to avoid repeated new's/deletes. + + See hash-list-test.cc for an example of how to use this object. +*/ + + +namespace kaldi { + +template class HashList { + + public: + struct Elem { + I key; + T val; + Elem *tail; + }; + + /// Constructor takes no arguments. Call SetSize to inform it of the likely size. + HashList(); + + /// Clears the hash and gives the head of the current list to the user; + /// ownership is transferred to the user (the user must call Delete() + /// for each element in the list, at his/her leisure). + Elem *Clear(); + + /// Gives the head of the current list to the user. Ownership retained in the + /// class. Caution: in December 2013 the return type was changed to const Elem* + /// and this function was made const. You may need to change some types of + /// local Elem* variables to const if this produces compilation errors. + const Elem *GetList() const; + + /// Think of this like delete(). It is to be called for each Elem in turn + /// after you "obtained ownership" by doing Clear(). This is not the opposite of + /// Insert, it is the opposite of New. It's really a memory operation. + inline void Delete(Elem *e); + + /// This should probably not be needed to be called directly by the user. Think of it as opposite + /// to Delete(); + inline Elem *New(); + + /// Find tries to find this element in the current list using the hashtable. + /// It returns NULL if not present. The Elem it returns is not owned by the user, + /// it is part of the internal list owned by this object, but the user is + /// free to modify the "val" element. + inline Elem *Find(I key); + + /// Insert inserts a new element into the hashtable/stored list. By calling this, + /// the user asserts that it is not already present (e.g. Find was called and + /// returned NULL). With current code, calling this if an element already exists will + /// result in duplicate elements in the structure, and Find() will find the + /// first one that was added. [but we don't guarantee this behavior]. + inline void Insert(I key, T val); + + /// Insert inserts another element with same key into the hashtable/stored list. + /// By calling this, the user asserts that one element with that key is already present. + /// We insert it that way, that all elements with the same key follow each other. + /// Find() will return the first one of the elements with the same key. + inline void InsertMore(I key, T val); + + /// SetSize tells the object how many hash buckets to allocate (should typically be + /// at least twice the number of objects we expect to go in the structure, for fastest + /// performance). It must be called while the hash is empty (e.g. after Clear() or + /// after initializing the object, but before adding anything to the hash. + void SetSize(size_t sz); + + /// Returns current number of hash buckets. + inline size_t Size() { return hash_size_; } + + ~HashList(); + private: + + struct HashBucket { + size_t prev_bucket; // index to next bucket (-1 if list tail). Note: list of buckets + // goes in opposite direction to list of Elems. + Elem *last_elem; // pointer to last element in this bucket (NULL if empty) + inline HashBucket(size_t i, Elem *e): prev_bucket(i), last_elem(e) {} + }; + + Elem *list_head_; // head of currently stored list. + size_t bucket_list_tail_; // tail of list of active hash buckets. + + size_t hash_size_; // number of hash buckets. + + std::vector buckets_; + + Elem *freed_head_; // head of list of currently freed elements. [ready for allocation] + + std::vector allocated_; // list of allocated blocks. + + static const size_t allocate_block_size_ = 1024; // Number of Elements to allocate in one block. Must be + // largish so storing allocated_ doesn't become a problem. +}; + + +} // end namespace kaldi + +#include "hash-list-inl.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-holder-inl.h b/kaldi_io/src/kaldi/util/kaldi-holder-inl.h new file mode 100644 index 0000000..6a66e61 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-holder-inl.h @@ -0,0 +1,800 @@ +// util/kaldi-holder-inl.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_HOLDER_INL_H_ +#define KALDI_UTIL_KALDI_HOLDER_INL_H_ + +#include +#include "util/kaldi-io.h" +#include "util/text-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/// \addtogroup holders +/// @{ + + +// KaldiObjectHolder is valid only for Kaldi objects with +// copy constructors, default constructors, and "normal" +// Kaldi Write and Read functions. E.g. it works for +// Matrix and Vector. +template class KaldiObjectHolder { + public: + typedef KaldiType T; + + KaldiObjectHolder(): t_(NULL) { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + t.Write(os, binary); + return os.good(); + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object: " << e.what(); + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; // Write failure. + } + } + + void Clear() { + if (t_) { + delete t_; + t_ = NULL; + } + } + + // Reads into the holder. + bool Read(std::istream &is) { + if (t_) delete t_; + t_ = new T; + // Don't want any existing state to complicate the read functioN: get new object. + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object, failed reading binary header\n"; + return false; + } + try { + t_->Read(is, is_binary); + return true; + } catch (std::exception &e) { + KALDI_WARN << "Exception caught reading Table object "; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + delete t_; + t_ = NULL; + return false; + } + } + + // Kaldi objects always have the stream open in binary mode for + // reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { + // code error if !t_. + if (!t_) KALDI_ERR << "KaldiObjectHolder::Value() called wrongly."; + return *t_; + } + + ~KaldiObjectHolder() { if (t_) delete t_; } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(KaldiObjectHolder); + T *t_; +}; + + +// BasicHolder is valid for float, double, bool, and integer +// types. There will be a compile time error otherwise, because +// we make sure that the {Write, Read}BasicType functions do not +// get instantiated for other types. + +template class BasicHolder { + public: + typedef BasicType T; + + BasicHolder(): t_(static_cast(-1)) { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + WriteBasicType(os, binary, t); + if (!binary) os << '\n'; // Makes output format more readable and + // easier to manipulate. + return os.good(); + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object: " << e.what(); + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; // Write failure. + } + } + + void Clear() { } + + // Reads into the holder. + bool Read(std::istream &is) { + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary header\n"; + return false; + } + try { + int c; + if (!is_binary) { // This is to catch errors, the class would work without it.. + // Eat up any whitespace and make sure it's not newline. + while (isspace((c = is.peek())) && c != static_cast('\n')) is.get(); + if (is.peek() == '\n') { + KALDI_WARN << "Found newline but expected basic type."; + return false; // This is just to catch a more- + // likely-than average type of error (empty line before the token), since + // ReadBasicType will eat it up. + } + } + + ReadBasicType(is, is_binary, &t_); + + if (!is_binary) { // This is to catch errors, the class would work without it.. + // make sure there is a newline. + while (isspace((c = is.peek())) && c != static_cast('\n')) is.get(); + if (is.peek() != '\n') { + KALDI_WARN << "BasicHolder::Read, expected newline, got " + << CharToString(is.peek()) << ", position " << is.tellg(); + return false; + } + is.get(); // Consume the newline. + } + return true; + } catch (std::exception &e) { + KALDI_WARN << "Exception caught reading Table object"; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { + return t_; + } + + ~BasicHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicHolder); + + T t_; +}; + + +/// A Holder for a vector of basic types, e.g. +/// std::vector, std::vector, and so on. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicVectorHolder { + public: + typedef std::vector T; + + BasicVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. + WriteBasicType(os, binary, static_cast(t.size())); + for (typename std::vector::const_iterator iter = t.begin(); + iter != t.end(); ++iter) + WriteBasicType(os, binary, *iter); + + } else { + for (typename std::vector::const_iterator iter = t.begin(); + iter != t.end(); ++iter) + WriteBasicType(os, binary, *iter); + os << '\n'; // Makes output format more readable and + // easier to manipulate. In text mode, this function writes something like + // "1 2 3\n". + } + return os.good(); + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object (BasicVector). "; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + std::string line; + getline(is, line); // this will discard the \n, if present. + if (is.fail()) { + KALDI_WARN << "BasicVectorHolder::Read, error reading line " << (is.eof() ? "[eof]" : ""); + return false; // probably eof. fail in any case. + } + std::istringstream line_is(line); + try { + while (1) { + line_is >> std::ws; // eat up whitespace. + if (line_is.eof()) break; + BasicType bt; + ReadBasicType(line_is, false, &bt); + t_.push_back(bt); + } + return true; + } catch(std::exception &e) { + KALDI_WARN << "BasicVectorHolder::Read, could not interpret line: " << line; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; + } + } else { // binary mode. + size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename std::vector::iterator iter = t_.begin(); + iter != t_.end(); + ++iter) { + ReadBasicType(is, true, &(*iter)); + } + return true; + } catch (...) { + KALDI_WARN << "BasicVectorHolder::Read, read error or unexpected data at archive entry beginning at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + ~BasicVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorHolder); + T t_; +}; + + +/// BasicVectorVectorHolder is a Holder for a vector of vector of +/// a basic type, e.g. std::vector >. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicVectorVectorHolder { + public: + typedef std::vector > T; + + BasicVectorVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. + WriteBasicType(os, binary, static_cast(t.size())); + for (typename std::vector >::const_iterator iter = t.begin(); + iter != t.end(); ++iter) { + KALDI_ASSERT(static_cast(static_cast(iter->size())) == iter->size()); + WriteBasicType(os, binary, static_cast(iter->size())); + for (typename std::vector::const_iterator iter2=iter->begin(); + iter2 != iter->end(); ++iter2) { + WriteBasicType(os, binary, *iter2); + } + } + } else { // text mode... + // In text mode, we write out something like (for integers): + // "1 2 3 ; 4 5 ; 6 ; ; 7 8 9 ;\n" + // where the semicolon is a terminator, not a separator + // (a separator would cause ambiguity between an + // empty list, and a list containing a single empty list). + for (typename std::vector >::const_iterator iter = t.begin(); + iter != t.end(); + ++iter) { + for (typename std::vector::const_iterator iter2=iter->begin(); + iter2 != iter->end(); ++iter2) + WriteBasicType(os, binary, *iter2); + os << "; "; + } + os << '\n'; + } + return os.good(); + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. "; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Failed reading binary header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + try { // catching errors from ReadBasicType.. + std::vector v; // temporary vector + while (1) { + int i = is.peek(); + if (i == -1) { + KALDI_WARN << "Unexpected EOF"; + return false; + } else if (static_cast(i) == '\n') { + if (!v.empty()) { + KALDI_WARN << "No semicolon before newline (wrong format)"; + return false; + } else { is.get(); return true; } + } else if (std::isspace(i)) { + is.get(); + } else if (static_cast(i) == ';') { + t_.push_back(v); + v.clear(); + is.get(); + } else { // some object we want to read... + BasicType b; + ReadBasicType(is, false, &b); // throws on error. + v.push_back(b); + } + } + } catch(std::exception &e) { + KALDI_WARN << "BasicVectorVectorHolder::Read, read error"; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; + } + } else { // binary mode. + size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename std::vector >::iterator iter = t_.begin(); + iter != t_.end(); + ++iter) { + int32 size2; + ReadBasicType(is, true, &size2); + iter->resize(size2); + for (typename std::vector::iterator iter2 = iter->begin(); + iter2 != iter->end(); + ++iter2) + ReadBasicType(is, true, &(*iter2)); + } + return true; + } catch (...) { + KALDI_WARN << "Read error or unexpected data at archive entry beginning at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + ~BasicVectorVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicVectorVectorHolder); + T t_; +}; + + +/// BasicPairVectorHolder is a Holder for a vector of pairs of +/// a basic type, e.g. std::vector >. +/// Note: a basic type is defined as a type for which ReadBasicType +/// and WriteBasicType are implemented, i.e. integer and floating +/// types, and bool. +template class BasicPairVectorHolder { + public: + typedef std::vector > T; + + BasicPairVectorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t) { + InitKaldiOutputStream(os, binary); // Puts binary header if binary mode. + try { + if (binary) { // need to write the size, in binary mode. + KALDI_ASSERT(static_cast(static_cast(t.size())) == t.size()); + // Or this Write routine cannot handle such a large vector. + // use int32 because it's fixed size regardless of compilation. + // change to int64 (plus in Read function) if this becomes a problem. + WriteBasicType(os, binary, static_cast(t.size())); + for (typename T::const_iterator iter = t.begin(); + iter != t.end(); ++iter) { + WriteBasicType(os, binary, iter->first); + WriteBasicType(os, binary, iter->second); + } + } else { // text mode... + // In text mode, we write out something like (for integers): + // "1 2 ; 4 5 ; 6 7 ; 8 9 \n" + // where the semicolon is a separator, not a terminator. + for (typename T::const_iterator iter = t.begin(); + iter != t.end();) { + WriteBasicType(os, binary, iter->first); + WriteBasicType(os, binary, iter->second); + ++iter; + if (iter != t.end()) + os << "; "; + } + os << '\n'; + } + return os.good(); + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught writing Table object. "; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; // Write failure. + } + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + bool is_binary; + if (!InitKaldiInputStream(is, &is_binary)) { + KALDI_WARN << "Reading Table object [integer type], failed reading binary header\n"; + return false; + } + if (!is_binary) { + // In text mode, we terminate with newline. + try { // catching errors from ReadBasicType.. + std::vector v; // temporary vector + while (1) { + int i = is.peek(); + if (i == -1) { + KALDI_WARN << "Unexpected EOF"; + return false; + } else if (static_cast(i) == '\n') { + if (t_.empty() && v.empty()) { + is.get(); + return true; + } else if (v.size() == 2) { + t_.push_back(std::make_pair(v[0], v[1])); + is.get(); + return true; + } else { + KALDI_WARN << "Unexpected newline, reading vector >; got " + << v.size() << " elements, expected 2."; + return false; + } + } else if (std::isspace(i)) { + is.get(); + } else if (static_cast(i) == ';') { + if (v.size() != 2) { + KALDI_WARN << "Wrong input format, reading vector >; got " + << v.size() << " elements, expected 2."; + return false; + } + t_.push_back(std::make_pair(v[0], v[1])); + v.clear(); + is.get(); + } else { // some object we want to read... + BasicType b; + ReadBasicType(is, false, &b); // throws on error. + v.push_back(b); + } + } + } catch(std::exception &e) { + KALDI_WARN << "BasicPairVectorHolder::Read, read error"; + if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + return false; + } + } else { // binary mode. + size_t filepos = is.tellg(); + try { + int32 size; + ReadBasicType(is, true, &size); + t_.resize(size); + for (typename T::iterator iter = t_.begin(); + iter != t_.end(); + ++iter) { + ReadBasicType(is, true, &(iter->first)); + ReadBasicType(is, true, &(iter->second)); + } + return true; + } catch (...) { + KALDI_WARN << "BasicVectorHolder::Read, read error or unexpected data at archive entry beginning at file position " << filepos; + return false; + } + } + } + + // Objects read/written with the Kaldi I/O functions always have the stream + // open in binary mode for reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + ~BasicPairVectorHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(BasicPairVectorHolder); + T t_; +}; + + + + +// We define a Token as a nonempty, printable, whitespace-free std::string. +// The binary and text formats here are the same (newline-terminated) +// and as such we don't bother with the binary-mode headers. +class TokenHolder { + public: + typedef std::string T; + + TokenHolder() {} + + static bool Write(std::ostream &os, bool, const T &t) { // ignore binary-mode. + KALDI_ASSERT(IsToken(t)); + os << t << '\n'; + return os.good(); + } + + void Clear() { t_.clear(); } + + // Reads into the holder. + bool Read(std::istream &is) { + is >> t_; + if (is.fail()) return false; + char c; + while (isspace(c = is.peek()) && c!= '\n') is.get(); + if (is.peek() != '\n') { + KALDI_ERR << "TokenHolder::Read, expected newline, got char " << CharToString(is.peek()) + << ", at stream pos " << is.tellg(); + return false; + } + is.get(); // get '\n' + return true; + } + + + // Since this is fundamentally a text format, read in text mode (would work + // fine either way, but doing it this way will exercise more of the code). + static bool IsReadInBinary() { return false; } + + const T &Value() const { return t_; } + + ~TokenHolder() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TokenHolder); + T t_; +}; + +// A Token is a nonempty, whitespace-free std::string. +// Class TokenVectorHolder is a Holder class for vectors of these. +class TokenVectorHolder { + public: + typedef std::vector T; + + TokenVectorHolder() { } + + static bool Write(std::ostream &os, bool, const T &t) { // ignore binary-mode. + for (std::vector::const_iterator iter = t.begin(); + iter != t.end(); + ++iter) { + KALDI_ASSERT(IsToken(*iter)); // make sure it's whitespace-free, printable and nonempty. + os << *iter << ' '; + } + os << '\n'; + return os.good(); + } + + void Clear() { t_.clear(); } + + + // Reads into the holder. + bool Read(std::istream &is) { + t_.clear(); + + // there is no binary/non-binary mode. + + std::string line; + getline(is, line); // this will discard the \n, if present. + if (is.fail()) { + KALDI_WARN << "BasicVectorHolder::Read, error reading line " << (is.eof() ? "[eof]" : ""); + return false; // probably eof. fail in any case. + } + const char *white_chars = " \t\n\r\f\v"; + SplitStringToVector(line, white_chars, true, &t_); // true== omit empty strings e.g. + // between spaces. + return true; + } + + // Read in text format since it's basically a text-mode thing.. doesn't really matter, + // it would work either way since we ignore the extra '\r'. + static bool IsReadInBinary() { return false; } + + const T &Value() const { return t_; } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TokenVectorHolder); + T t_; +}; + + +class HtkMatrixHolder { + public: + typedef std::pair, HtkHeader> T; + + HtkMatrixHolder() {} + + static bool Write(std::ostream &os, bool binary, const T &t) { + if (!binary) + KALDI_ERR << "Non-binary HTK-format write not supported."; + bool ans = WriteHtk(os, t.first, t.second); + if (!ans) + KALDI_WARN << "Error detected writing HTK-format matrix."; + return ans; + } + + void Clear() { t_.first.Resize(0, 0); } + + // Reads into the holder. + bool Read(std::istream &is) { + bool ans = ReadHtk(is, &t_.first, &t_.second); + if (!ans) { + KALDI_WARN << "Error detected reading HTK-format matrix."; + return false; + } + return ans; + } + + // HTK-format matrices only read in binary. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + + // No destructor. + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(HtkMatrixHolder); + T t_; +}; + +// SphinxMatrixHolder can be used to read and write feature files in +// CMU Sphinx format. 13-dimensional big-endian features are assumed. +// The ultimate reference is SphinxBase's source code (for example see +// feat_s2mfc_read() in src/libsphinxbase/feat/feat.c). +// We can't fully automate the detection of machine/feature file endianess +// mismatch here, because for this Sphinx relies on comparing the feature +// file's size with the number recorded in its header. We are working with +// streams, however(what happens if this is a Kaldi archive?). This should +// be no problem, because the usage help of Sphinx' "wave2feat" for example +// says that Sphinx features are always big endian. +// Note: the kFeatDim defaults to 13, see forward declaration in kaldi-holder.h +template class SphinxMatrixHolder { + public: + typedef Matrix T; + + SphinxMatrixHolder() {} + + void Clear() { feats_.Resize(0, 0); } + + // Writes Sphinx-format features + static bool Write(std::ostream &os, bool binary, const T &m) { + if (!binary) { + KALDI_WARN << "SphinxMatrixHolder can't write Sphinx features in text "; + return false; + } + + int32 size = m.NumRows() * m.NumCols(); + if (MachineIsLittleEndian()) + KALDI_SWAP4(size); + os.write((char*) &size, sizeof(size)); // write the header + + for (MatrixIndexT i = 0; i < m.NumRows(); i++) { + float32 tmp[m.NumCols()]; + for (MatrixIndexT j = 0; j < m.NumCols(); j++) { + tmp[j] = static_cast(m(i, j)); + if (MachineIsLittleEndian()) + KALDI_SWAP4(tmp[j]); + } + os.write((char*) tmp, sizeof(tmp)); + } + + return true; + } + + // Reads the features into a Kaldi Matrix + bool Read(std::istream &is) { + int32 nmfcc; + + is.read((char*) &nmfcc, sizeof(nmfcc)); + if (MachineIsLittleEndian()) + KALDI_SWAP4(nmfcc); + KALDI_VLOG(2) << "#feats: " << nmfcc; + int32 nfvec = nmfcc / kFeatDim; + if ((nmfcc % kFeatDim) != 0) { + KALDI_WARN << "Sphinx feature count is inconsistent with vector length "; + return false; + } + + feats_.Resize(nfvec, kFeatDim); + for (MatrixIndexT i = 0; i < feats_.NumRows(); i++) { + if (sizeof(BaseFloat) == sizeof(float32)) { + is.read((char*) feats_.RowData(i), kFeatDim * sizeof(float32)); + if (!is.good()) { + KALDI_WARN << "Unexpected error/EOF while reading Sphinx features "; + return false; + } + if (MachineIsLittleEndian()) { + for (MatrixIndexT j=0; j < kFeatDim; j++) + KALDI_SWAP4(feats_(i, j)); + } + } else { // KALDI_DOUBLEPRECISION=1 + float32 tmp[kFeatDim]; + is.read((char*) tmp, sizeof(tmp)); + if (!is.good()) { + KALDI_WARN << "Unexpected error/EOF while reading Sphinx features "; + return false; + } + for (MatrixIndexT j=0; j < kFeatDim; j++) { + if (MachineIsLittleEndian()) + KALDI_SWAP4(tmp[j]); + feats_(i, j) = static_cast(tmp[j]); + } + } + } + + return true; + } + + // Only read in binary + static bool IsReadInBinary() { return true; } + + const T &Value() const { return feats_; } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(SphinxMatrixHolder); + T feats_; +}; + + +/// @} end "addtogroup holders" + +} // end namespace kaldi + + + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-holder.h b/kaldi_io/src/kaldi/util/kaldi-holder.h new file mode 100644 index 0000000..95f1183 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-holder.h @@ -0,0 +1,207 @@ +// util/kaldi-holder.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_HOLDER_H_ +#define KALDI_UTIL_KALDI_HOLDER_H_ + +#include +#include "util/kaldi-io.h" +#include "util/text-utils.h" +#include "matrix/kaldi-vector.h" + +namespace kaldi { + + +// The Table class uses a Holder class to wrap objects, and make them behave +// in a "normalized" way w.r.t. reading and writing, so the Table class can +// be template-ized without too much trouble. Look below this +// comment (search for GenericHolder) to see what it looks like. +// +// Requirements of the holder class: +// +// They can only contain objects that can be read/written without external +// information; other objects cannot be stored in this type of archive. +// +// In terms of what functions it should have, see GenericHolder below. +// It is just for documentation. +// +// (1) Requirements of the Read and Write functions +// +// The Read and Write functions should have the property that in a longer +// file, if the Read function is started from where the Write function started +// writing, it should go to where the Write function stopped writing, in either +// text or binary mode (but it's OK if it doesn't eat up trailing space). +// +// [Desirable property: when writing in text mode the output should contain +// exactly one newline, at the end of the output; this makes it easier to manipulate] +// +// [Desirable property for classes: the output should just be a binary-mode +// header (if in binary mode and it's a Kaldi object, or no header +// othewise), and then the output of Object.Write(). This means that when +// written to individual files with the scp: type of wspecifier, we can read +// the individual files in the "normal" Kaldi way by reading the binary +// header and then the object.] +// +// +// The Write function takes a 'binary' argument. In general, each object will +// have two formats: text and binary. However, it's permitted to throw() if +// asked to read in the text format if there is none. The file will be open, if +// the file system has binary/text modes, in the corresponding mode. However, +// the object should have a file-mode in which it can read either text or binary +// output. It announces this via the static IsReadInBinary() function. This +// will generally be the binary mode and it means that where necessary, in text +// formats, we must ignore \r characters. +// +// Memory requirements: if it allocates memory, the destructor should +// free that memory. Copying and assignment of Holder objects may be +// disallowed as the Table code never does this. + + +/// GenericHolder serves to document the requirements of the Holder interface; +/// it's not intended to be used. +template class GenericHolder { + public: + typedef SomeType T; + + /// Must have a constructor that takes no arguments. + GenericHolder() { } + + /// Write writes this object of type T. Possibly also writes a binary-mode + /// header so that the Read function knows which mode to read in (since the + /// Read function does not get this information). It's a static member so we + /// can write those not inside this class (can use this function with Value() + /// to write from this class). The Write method may throw if it cannot write + /// the object in the given (binary/non-binary) mode. The holder object can + /// assume the stream has been opened in the given mode (where relevant). The + /// object can write the data how it likes. + static bool Write(std::ostream &os, bool binary, const T &t); + + /// Reads into the holder. Must work out from the stream (which will be opened + /// on Windows in binary mode if the IsReadInBinary() function of this class + /// returns true, and text mode otherwise) whether the actual data is binary or + /// not (usually via reading the Kaldi binary-mode header). We put the + /// responsibility for reading the Kaldi binary-mode header in the Read + /// function (rather than making the binary mode an argument to this function), + /// so that for non-Kaldi binary files we don't have to write the header, which + /// would prevent the file being read by non-Kaldi programs (e.g. if we write + /// to individual files using an scp). + /// + /// Read must deallocate any existing data we have here, if applicable (must + /// not assume the object was newly constructed). + /// + /// Returns true on success. + bool Read(std::istream &is); + + /// IsReadInBinary() will return true if the object wants the file to be + /// opened in binary for reading (if the file system has binary/text modes), + /// and false otherwise. Static function. Kaldi objects always return true + /// as they always read in binary mode. Note that we must be able to read, in + /// this mode, objects written in both text and binary mode by Write (which + /// may mean ignoring "\r" characters). I doubt we will ever want this + /// function to return false. + static bool IsReadInBinary() { return true; } + + /// Returns the value of the object held here. Will only + /// ever be called if Read() has been previously called and it returned + /// true (so OK to throw exception if no object was read). + const T &Value() const { return t_; } // if t is a pointer, would return *t_; + + /// The Clear() function doesn't have to do anything. Its purpose is to + /// allow the object to free resources if they're no longer needed. + void Clear() { } + + /// If the object held pointers, the destructor would free them. + ~GenericHolder() { } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(GenericHolder); + T t_; // t_ may alternatively be of type T*. +}; + + +// See kaldi-holder-inl.h for examples of some actual Holder +// classes and templates. + + +// The following two typedefs should probably be in their own file, but they're +// here until there are enough of them to warrant their own header. + + +/// \addtogroup holders +/// @{ + +/// KaldiObjectHolder works for Kaldi objects that have the "standard" Read and Write +/// functions, and a copy constructor. +template class KaldiObjectHolder; + +/// BasicHolder is valid for float, double, bool, and integer +/// types. There will be a compile time error otherwise, because +/// we make sure that the {Write, Read}BasicType functions do not +/// get instantiated for other types. +template class BasicHolder; + + +// A Holder for a vector of basic types, e.g. +// std::vector, std::vector, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. +template class BasicVectorHolder; + + +// A holder for vectors of vectors of basic types, e.g. +// std::vector >, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. +template class BasicVectorVectorHolder; + +// A holder for vectors of pairsof basic types, e.g. +// std::vector >, and so on. +// Note: a basic type is defined as a type for which ReadBasicType +// and WriteBasicType are implemented, i.e. integer and floating +// types, and bool. Text format is (e.g. for integers), +// "1 12 ; 43 61 ; 17 8 \n" +template class BasicPairVectorHolder; + +/// We define a Token (not a typedef, just a word) as a nonempty, printable, +/// whitespace-free std::string. The binary and text formats here are the same +/// (newline-terminated) and as such we don't bother with the binary-mode headers. +class TokenHolder; + +/// Class TokenVectorHolder is a Holder class for vectors of Tokens (T == std::string). +class TokenVectorHolder; + +/// A class for reading/writing HTK-format matrices. +/// T == std::pair, HtkHeader> +class HtkMatrixHolder; + +/// A class for reading/writing Sphinx format matrices. +template class SphinxMatrixHolder; + + +/// @} end "addtogroup holders" + + +} // end namespace kaldi + +#include "kaldi-holder-inl.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-io-inl.h b/kaldi_io/src/kaldi/util/kaldi-io-inl.h new file mode 100644 index 0000000..7df7505 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-io-inl.h @@ -0,0 +1,45 @@ +// util/kaldi-io-inl.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_KALDI_IO_INL_H_ +#define KALDI_UTIL_KALDI_IO_INL_H_ + + +namespace kaldi { + +bool Input::Open(const std::string &rxfilename, bool *binary) { + return OpenInternal(rxfilename, true, binary); +} + +bool Input::OpenTextMode(const std::string &rxfilename) { + return OpenInternal(rxfilename, false, NULL); +} + +bool Input::IsOpen() { + return impl_ != NULL; +} + +bool Output::IsOpen() { + return impl_ != NULL; +} + + +} // end namespace kaldi. + + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-io.h b/kaldi_io/src/kaldi/util/kaldi-io.h new file mode 100644 index 0000000..f2c7563 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-io.h @@ -0,0 +1,264 @@ +// util/kaldi-io.h + +// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_KALDI_IO_H_ +#define KALDI_UTIL_KALDI_IO_H_ + +#include // For isspace. +#include +#include +#include "base/kaldi-common.h" +#ifdef _MSC_VER +# include +# include +#endif + + + +namespace kaldi { + +class OutputImplBase; // Forward decl; defined in a .cc file +class InputImplBase; // Forward decl; defined in a .cc file + +/// \addtogroup io_group +/// @{ + +// The Output and Input classes handle stream-opening for "extended" filenames +// that include actual files, standard-input/standard-output, pipes, and +// offsets into actual files. They also handle reading and writing the +// binary-mode headers for Kaldi files, where applicable. The classes have +// versions of the Open routines that throw and do not throw, depending whether +// the calling code wants to catch the errors or not; there are also versions +// that write (or do not write) the Kaldi binary-mode header that says if it's +// binary mode. Generally files that contain Kaldi objects will have the header +// on, so we know upon reading them whether they have the header. So you would +// use the OpenWithHeader routines for these (or the constructor); but other +// types of objects (e.g. FSTs) would have files without a header so you would +// use OpenNoHeader. + +// We now document the types of extended filenames that we use. +// +// A "wxfilename" is an extended filename for writing. It can take three forms: +// (1) Filename: e.g. "/some/filename", "./a/b/c", "c:\Users\dpovey\My Documents\\boo" +// (whatever the actual file-system interprets) +// (2) Standard output: "" or "-" +// (3) A pipe: e.g. "gunzip -c /tmp/abc.gz |" +// +// +// A "rxfilename" is an extended filename for reading. It can take four forms: +// (1) An actual filename, whatever the file-system can read, e.g. "/my/file". +// (2) Standard input: "" or "-" +// (3) A pipe: e.g. "| gzip -c > /tmp/abc.gz" +// (4) An offset into a file, e.g.: "/mnt/blah/data/1.ark:24871" +// [these are created by the Table and TableWriter classes; I may also write +// a program that creates them for arbitrary files] +// + + +// Typical usage: +// ... +// bool binary; +// MyObject.Write(Output(some_filename, binary).Stream(), binary); +// +// ... more extensive example: +// { +// Output ko(some_filename, binary); +// MyObject1.Write(ko.Stream(), binary); +// MyObject2.Write(ko.Stream(), binary); +// } + + + +enum OutputType { + kNoOutput, + kFileOutput, + kStandardOutput, + kPipeOutput +}; + +/// ClassifyWxfilename interprets filenames as follows: +/// - kNoOutput: invalid filenames (leading or trailing space, things that look +/// like wspecifiers and rspecifiers or like pipes to read from with leading |. +/// - kFileOutput: Normal filenames +/// - kStandardOutput: The empty string or "-", interpreted as standard output +/// - kPipeOutput: pipes, e.g. "gunzip -c some_file.gz |" +OutputType ClassifyWxfilename(const std::string &wxfilename); + +enum InputType { + kNoInput, + kFileInput, + kStandardInput, + kOffsetFileInput, + kPipeInput +}; + +/// ClassifyRxfilenames interprets filenames for reading as follows: +/// - kNoInput: invalid filenames (leading or trailing space, things that +/// look like wspecifiers and rspecifiers or pipes to write to +/// with trailing |. +/// - kFileInput: normal filenames +/// - kStandardInput: the empty string or "-" +/// - kPipeInput: e.g. "| gzip -c > blah.gz" +/// - kOffsetFileInput: offsets into files, e.g. /some/filename:12970 +InputType ClassifyRxfilename(const std::string &rxfilename); + + +class Output { + public: + // The normal constructor, provided for convenience. + // Equivalent to calling with default constructor then Open() + // with these arguments. + Output(const std::string &filename, bool binary, bool write_header = true); + + Output(): impl_(NULL) {}; + + /// This opens the stream, with the given mode (binary or text). It returns + /// true on success and false on failure. However, it will throw if something + /// was already open and could not be closed (to avoid this, call Close() + /// first. if write_header == true and binary == true, it writes the Kaldi + /// binary-mode header ('\0' then 'B'). You may call Open even if it is + /// already open; it will close the existing stream and reopen (however if + /// closing the old stream failed it will throw). + bool Open(const std::string &wxfilename, bool binary, bool write_header); + + inline bool IsOpen(); // return true if we have an open stream. Does not imply + // stream is good for writing. + + std::ostream &Stream(); // will throw if not open; else returns stream. + + // Close closes the stream. Calling Close is never necessary unless you + // want to avoid exceptions being thrown. There are times when calling + // Close will hurt efficiency (basically, when using offsets into files, + // and using the same Input object), + // but most of the time the user won't be doing this directly, it will + // be done in kaldi-table.{h, cc}, so you don't have to worry about it. + bool Close(); + + // This will throw if stream could not be closed (to check error status, + // call Close()). + ~Output(); + + private: + OutputImplBase *impl_; // non-NULL if open. + std::string filename_; + KALDI_DISALLOW_COPY_AND_ASSIGN(Output); +}; + + +// bool binary_in; +// Input ki(some_filename, &binary_in); +// MyObject.Read(ki, binary_in); +// +// ... more extensive example: +// +// { +// bool binary_in; +// Input ki(some_filename, &binary_in); +// MyObject1.Read(ki.Stream(), &binary_in); +// MyObject2.Write(ki.Stream(), &binary_in); +// } +// Note that to catch errors you need to use try.. catch. +// Input communicates errors by throwing exceptions. + + +// Input interprets four kinds of filenames: +// (1) Normal filenames +// (2) The empty string or "-", interpreted as standard output +// (3) Pipes, e.g. "| gzip -c > some_file.gz" +// (4) Offsets into [real] files, e.g. "/my/filename:12049" +// The last one has no correspondence in Output. + + +class Input { + public: + /// The normal constructor. Opens the stream in binary mode. + /// Equivalent to calling the default constructor followed by Open(); then, if + /// binary != NULL, it calls ReadHeader(), putting the output in "binary"; it + /// throws on error. + Input(const std::string &rxfilename, bool *contents_binary = NULL); + + Input(): impl_(NULL) {} + + // Open opens the stream for reading (the mode, where relevant, is binary; use + // OpenTextMode for text-mode, we made this a separate function rather than a + // boolean argument, to avoid confusion with Kaldi's text/binary distinction, + // since reading in the file system's text mode is unusual.) If + // contents_binary != NULL, it reads the binary-mode header and puts it in the + // "binary" variable. Returns true on success. If it returns false it will + // not be open. You may call Open even if it is already open; it will close + // the existing stream and reopen (however if closing the old stream failed it + // will throw). + inline bool Open(const std::string &rxfilename, bool *contents_binary = NULL); + + // As Open but (if the file system has text/binary modes) opens in text mode; + // you shouldn't ever have to use this as in Kaldi we read even text files in + // binary mode (and ignore the \r). + inline bool OpenTextMode(const std::string &rxfilename); + + // Return true if currently open for reading and Stream() will + // succeed. Does not guarantee that the stream is good. + inline bool IsOpen(); + + // It is never necessary or helpful to call Close, except if + // you are concerned about to many filehandles being open. + // Close does not throw. + void Close(); + + // Returns the underlying stream. Throws if !IsOpen() + std::istream &Stream(); + + // Destructor does not throw: input streams may legitimately fail so we + // don't worry about the status when we close them. + ~Input(); + private: + bool OpenInternal(const std::string &rxfilename, bool file_binary, bool *contents_binary); + InputImplBase *impl_; + KALDI_DISALLOW_COPY_AND_ASSIGN(Input); +}; + +template inline void ReadKaldiObject(const std::string &filename, + C *c) { + bool binary_in; + Input ki(filename, &binary_in); + c->Read(ki.Stream(), binary_in); +} + +template inline void WriteKaldiObject(const C &c, + const std::string &filename, + bool binary) { + Output ko(filename, binary); + c.Write(ko.Stream(), binary); +} + +/// PrintableRxfilename turns the rxfilename into a more human-readable +/// form for error reporting, i.e. it does quoting and escaping and +/// replaces "" or "-" with "standard input". +std::string PrintableRxfilename(std::string rxfilename); + +/// PrintableWxfilename turns the filename into a more human-readable +/// form for error reporting, i.e. it does quoting and escaping and +/// replaces "" or "-" with "standard output". +std::string PrintableWxfilename(std::string wxfilename); + +/// @} + +} // end namespace kaldi. + +#include "kaldi-io-inl.h" + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-pipebuf.h b/kaldi_io/src/kaldi/util/kaldi-pipebuf.h new file mode 100644 index 0000000..43e5a2e --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-pipebuf.h @@ -0,0 +1,90 @@ +// util/kaldi-pipebuf.h + +// Copyright 2009-2011 Ondrej Glembek + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +/** @file kaldi-pipebuf.h + * This is an Kaldi C++ Library header. + */ + +#ifndef KALDI_UTIL_KALDI_PIPEBUF_H_ +#define KALDI_UTIL_KALDI_PIPEBUF_H_ + +#if defined(_LIBCPP_VERSION) // libc++ +#include "basic-filebuf.h" +#else +#include +#endif + +namespace kaldi +{ +// This class provides a way to initialize a filebuf with a FILE* pointer +// directly; it will not close the file pointer when it is deleted. +// The C++ standard does not allow implementations of C++ to provide +// this constructor within basic_filebuf, which makes it hard to deal +// with pipes using completely native C++. This is a workaround + +#ifdef _MSC_VER +#elif defined(_LIBCPP_VERSION) // libc++ +template > +class basic_pipebuf : public basic_filebuf +{ + public: + typedef basic_pipebuf ThisType; + + public: + basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) + : basic_filebuf() { + this->open(fptr, mode); + if (!this->is_open()) { + KALDI_WARN << "Error initializing pipebuf"; // probably indicates + // code error, if the fptr was good. + return; + } + } +}; // class basic_pipebuf +#else +template > +class basic_pipebuf : public std::basic_filebuf +{ + public: + typedef basic_pipebuf ThisType; + + public: + basic_pipebuf(FILE *fptr, std::ios_base::openmode mode) + : std::basic_filebuf() { + this->_M_file.sys_open(fptr, mode); + if (!this->is_open()) { + KALDI_WARN << "Error initializing pipebuf"; // probably indicates + // code error, if the fptr was good. + return; + } + this->_M_mode = mode; + this->_M_buf_size = BUFSIZ; + this->_M_allocate_internal_buffer(); + this->_M_reading = false; + this->_M_writing = false; + this->_M_set_buffer(-1); + } +}; // class basic_pipebuf +#endif // _MSC_VER + +}; // namespace kaldi + +#endif // KALDI_UTIL_KALDI_PIPEBUF_H_ + diff --git a/kaldi_io/src/kaldi/util/kaldi-table-inl.h b/kaldi_io/src/kaldi/util/kaldi-table-inl.h new file mode 100644 index 0000000..6b73c88 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-table-inl.h @@ -0,0 +1,2246 @@ +// util/kaldi-table-inl.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_KALDI_TABLE_INL_H_ +#define KALDI_UTIL_KALDI_TABLE_INL_H_ + +#include +#include "util/kaldi-io.h" +#include "util/text-utils.h" +#include "util/stl-utils.h" // for StringHasher. + + +namespace kaldi { + +/// \addtogroup table_impl_types +/// @{ + +template class SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + // note that Open takes rxfilename not rspecifier. + virtual bool Open(const std::string &rxfilename) = 0; + virtual bool Done() const = 0; + virtual bool IsOpen() const = 0; + virtual std::string Key() = 0; + virtual const T &Value() = 0; + virtual void FreeCurrent() = 0; + virtual void Next() = 0; + virtual bool Close() = 0; + SequentialTableReaderImplBase() { } + virtual ~SequentialTableReaderImplBase() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(SequentialTableReaderImplBase); +}; + + +// This is the implementation for SequentialTableReader +// when it's actually a script file. +template class SequentialTableReaderScriptImpl: + public SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + + SequentialTableReaderScriptImpl(): state_(kUninitialized) { } + + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized) + if (! Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "TableReader::Open, error closing previous input: " + << "rspecifier was " << rspecifier_; + bool binary; + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, &script_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kScriptRspecifier); + if (!script_input_.Open(script_rxfilename_, &binary)) { // Failure on Open + KALDI_WARN << "Failed to open script file " + << PrintableRxfilename(script_rxfilename_); + state_ = kUninitialized; + return false; + } else { // Open succeeded. + if (binary) { // script file should not be binary file.. + state_ = kError; // bad script file. + script_input_.Close(); + return false; + } else { + state_ = kFileStart; + Next(); + if (state_ == kError) { + script_input_.Close(); + return false; + } + if (opts_.permissive) { // Next() will have preloaded. + KALDI_ASSERT(state_ == kLoadSucceeded || state_ == kEof); + } else { + KALDI_ASSERT(state_ == kHaveScpLine || state_ == kEof); + } + return true; // Success. + } + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kError: case kHaveScpLine: case kLoadSucceeded: case kLoadFailed: return true; + case kUninitialized: return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; // kFileStart is not valid + // state for user to call something on. + return false; + } + } + + virtual bool Done() const { + switch (state_) { + case kHaveScpLine: return false; + case kLoadSucceeded: case kLoadFailed: return false; + // These cases are because we want LoadCurrent() + // to be callable after Next() and to not change the Done() status [only Next() should change + // the Done() status]. + case kEof: case kError: return true; // Error condition, like Eof, counts as Done(); the destructor + // or Close() will inform the user of the error. + default: KALDI_ERR << "Done() called on TableReader object at the wrong time."; + return false; + } + } + + virtual std::string Key() { + // Valid to call this whenever Done() returns false. + switch (state_) { + case kHaveScpLine: case kLoadSucceeded: case kLoadFailed: break; + default: + // coding error. + KALDI_ERR << "Key() called on TableReader object at the wrong time."; + } + return key_; + } + const T &Value() { + StateType orig_state = state_; + if (state_ == kHaveScpLine) LoadCurrent(); // Takes + // state_ to kLoadSucceeded or kLoadFailed. + if (state_ == kLoadFailed) { // this can happen due to + // a file listed in an scp file not existing, or + // read failure, failure of a command, etc. + if (orig_state == kHaveScpLine) + KALDI_ERR << "TableReader: failed to load object from " + << PrintableRxfilename(data_rxfilename_) + << " (to suppress this error, add the permissive " + << "(p, ) option to the rspecifier."; + + else // orig_state_ was kLoadFailed, which only could have happened + // if the user called FreeCurrent(). + KALDI_ERR << "TableReader: you called Value() after FreeCurrent()."; + } else if (state_ != kLoadSucceeded) { + // This would be a coding error. + KALDI_ERR << "TableReader: Value() called at the wrong time."; + } + return holder_.Value(); + } + void FreeCurrent() { + if (state_ == kLoadSucceeded) { + holder_.Clear(); + state_ = kLoadFailed; + } else { + KALDI_WARN << "TableReader: FreeCurrent called at the wrong time."; + } + } + void Next() { + while (1) { + NextScpLine(); + if (Done()) return; + if (opts_.permissive) { + // Permissive mode means, when reading scp files, we treat keys whose scp entry + // cannot be read as nonexistent. This means trying to read. + if (LoadCurrent()) return; // Success. + // else try the next scp line. + } else { + return; // We go the next key; Value() will crash if we can't + // read the scp line. + } + } + } + + virtual bool Close() { + // Close() will succeed if the stream was not in an error + // state. To clean up, it also closes the Input objects if + // they're open. + if (script_input_.IsOpen()) + script_input_.Close(); + if (data_input_.IsOpen()) + data_input_.Close(); + if (state_ == kLoadSucceeded) + holder_.Clear(); + if (!this->IsOpen()) + KALDI_ERR << "Close() called on input that was not open."; + StateType old_state = state_; + state_ = kUninitialized; + if (old_state == kError) { + if (opts_.permissive) { + KALDI_WARN << "Close() called on scp file with read error, ignoring the " + "error because permissive mode specified."; + return true; + } else return false; // User will do something with the error status. + } else return true; + } + + virtual ~SequentialTableReaderScriptImpl() { + if (state_ == kError) + KALDI_ERR << "TableReader: reading script file failed: from scp " + << PrintableRxfilename(script_rxfilename_); + // If you don't want this exception to be thrown you can + // call Close() and check the status. + if (state_ == kLoadSucceeded) + holder_.Clear(); + } + private: + bool LoadCurrent() { + // Attempts to load object whose rxfilename is on the current scp line. + if (state_ != kHaveScpLine) + KALDI_ERR << "TableReader: LoadCurrent() called at the wrong time."; + bool ans; + // note, NULL means it doesn't read the binary-mode header + if (Holder::IsReadInBinary()) ans = data_input_.Open(data_rxfilename_, NULL); + else ans = data_input_.OpenTextMode(data_rxfilename_); + if (!ans) { + // May want to make this warning a VLOG at some point + KALDI_WARN << "TableReader: failed to open file " + << PrintableRxfilename(data_rxfilename_); + state_ = kLoadFailed; + return false; + } else { + if (holder_.Read(data_input_.Stream())) { + state_ = kLoadSucceeded; + return true; + } else { // holder_ will not contain data. + KALDI_WARN << "TableReader: failed to load object from " + << PrintableRxfilename(data_rxfilename_); + state_ = kLoadFailed; + return false; + } + } + } + + // Reads the next line in the script file. + void NextScpLine() { + switch (state_) { + case kLoadSucceeded: holder_.Clear(); break; + case kHaveScpLine: case kLoadFailed: case kFileStart: break; + default: + // No other states are valid to call Next() from. + KALDI_ERR << "Reading script file: Next called wrongly."; + } + std::string line; + if (getline(script_input_.Stream(), line)) { + SplitStringOnFirstSpace(line, &key_, &data_rxfilename_); + if (!key_.empty() && !data_rxfilename_.empty()) { + // Got a valid line. + state_ = kHaveScpLine; + } else { + // Got an invalid line. + state_ = kError; // we can't make sense of this + // scp file and will now die. + } + } else { + state_ = kEof; // nothing more in the scp file. + // Might as well close the input streams as don't need them. + script_input_.Close(); + if (data_input_.IsOpen()) + data_input_.Close(); + } + } + + + Input script_input_; // Input object for the .scp file + Input data_input_; // Input object for the entries in + // the script file. + Holder holder_; // Holds the object. + bool binary_; // Binary-mode archive. + std::string key_; + std::string rspecifier_; + std::string script_rxfilename_; // of the script file. + RspecifierOptions opts_; // options. + std::string data_rxfilename_; // of the file we're reading. + enum StateType { + // [The state of the reading process] [does holder_ [is script_inp_ + // have object] open] + kUninitialized, // Uninitialized or closed. no no + kEof, // We did Next() and found eof in script file. no no + kError, // Some other error no yes + kHaveScpLine, // Just called Open() or Next() and have a no yes + // line of the script file but no data. + kLoadSucceeded, // Called LoadCurrent() and it succeeded. yes yes + kLoadFailed, // Called LoadCurrent() and it failed, no yes + // or the user called FreeCurrent().. note, + // if when called by user we are in this state, + // it means the user called FreeCurrent(). + kFileStart, // [state we only use internally] no yes + } state_; + private: +}; + + +// This is the implementation for SequentialTableReader +// when it's an archive. Note that the archive format is: +// key1 [space] object1 key2 [space] +// object2 ... eof. +// "object1" is the output of the Holder::Write function and will +// typically contain a binary header (in binary mode) and then +// the output of object.Write(os, binary). +// The archive itself does not care whether it is in binary +// or text mode, for reading purposes. + +template class SequentialTableReaderArchiveImpl: + public SequentialTableReaderImplBase { + public: + typedef typename Holder::T T; + + SequentialTableReaderArchiveImpl(): state_(kUninitialized) { } + + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized) { + if (! Close()) { // call Close() yourself to suppress this exception. + if (opts_.permissive) + KALDI_WARN << "TableReader::Open, error closing previous input " + "(only warning, since permissive mode)."; + else + KALDI_ERR << "TableReader::Open, error closing previous input."; + } + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, + &archive_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kArchiveRspecifier); + + bool ans; + // NULL means don't expect binary-mode header + if (Holder::IsReadInBinary()) + ans = input_.Open(archive_rxfilename_, NULL); + else + ans = input_.OpenTextMode(archive_rxfilename_); + if (!ans) { // header. + KALDI_WARN << "TableReader: failed to open stream " + << PrintableRxfilename(archive_rxfilename_); + state_ = kUninitialized; // Failure on Open + return false; // User should print the error message. + } + state_ = kFileStart; + Next(); + if (state_ == kError) { + KALDI_WARN << "Error beginning to read archive file (wrong filename?): " + << PrintableRxfilename(archive_rxfilename_); + input_.Close(); + state_ = kUninitialized; + return false; + } + KALDI_ASSERT(state_ == kHaveObject || state_ == kEof); + return true; + } + + virtual void Next() { + switch (state_) { + case kHaveObject: + holder_.Clear(); break; + case kFileStart: case kFreedObject: + break; + default: + KALDI_ERR << "TableReader: Next() called wrongly."; + } + std::istream &is = input_.Stream(); + is.clear(); // Clear any fail bits that may have been set... just in case + // this happened in the Read function. + is >> key_; // This eats up any leading whitespace and gets the string. + if (is.eof()) { + state_ = kEof; + return; + } + if (is.fail()) { // This shouldn't really happen, barring file-system errors. + KALDI_WARN << "Error reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + int c; + if ((c = is.peek()) != ' ' && c != '\t' && c != '\n') { // We expect a space ' ' after the key. + // We also allow tab [which is consumed] and newline [which is not], just + // so we can read archives generated by scripts that may not be fully + // aware of how this format works. + KALDI_WARN << "Invalid archive file format: expected space after key " + << key_ << ", got character " + << CharToString(static_cast(is.peek())) << ", reading " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + if (c != '\n') is.get(); // Consume the space or tab. + if (holder_.Read(is)) { + state_ = kHaveObject; + return; + } else { + KALDI_WARN << "Object read failed, reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kError: case kHaveObject: case kFreedObject: return true; + case kUninitialized: return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; // kFileStart is not valid + // state for user to call something on. + return false; + } + } + + virtual bool Done() const { + switch (state_) { + case kHaveObject: + return false; + case kEof: case kError: + return true; // Error-state counts as Done(), but destructor + // will fail (unless you check the status with Close()). + default: + KALDI_ERR << "Done() called on TableReader object at the wrong time."; + return false; + } + } + + virtual std::string Key() { + // Valid to call this whenever Done() returns false + switch (state_) { + case kHaveObject: break; // only valid case. + default: + // coding error. + KALDI_ERR << "Key() called on TableReader object at the wrong time."; + } + return key_; + } + const T &Value() { + switch (state_) { + case kHaveObject: + break; // only valid case. + default: + // coding error. + KALDI_ERR << "Value() called on TableReader object at the wrong time."; + } + return holder_.Value(); + } + virtual void FreeCurrent() { + if (state_ == kHaveObject) { + holder_.Clear(); + state_ = kFreedObject; + } else + KALDI_WARN << "TableReader: FreeCurernt called at the wrong time."; + } + + virtual bool Close() { + if (! this->IsOpen()) + KALDI_ERR << "Close() called on TableReader twice or otherwise wrongly."; + if (input_.IsOpen()) + input_.Close(); + if (state_ == kHaveObject) + holder_.Clear(); + bool ans; + if (opts_.permissive) { + ans = true; // always return success. + if (state_ == kError) + KALDI_WARN << "Error detected closing TableReader for archive " + << PrintableRxfilename(archive_rxfilename_) << " but ignoring " + << "it as permissive mode specified."; + } else + ans = (state_ != kError); // If error state, user should detect it. + state_ = kUninitialized; + return ans; + } + + virtual ~SequentialTableReaderArchiveImpl() { + if (state_ == kError) { + if (opts_.permissive) + KALDI_WARN << "Error detected closing TableReader for archive " + << PrintableRxfilename(archive_rxfilename_) << " but ignoring " + << "it as permissive mode specified."; + else + KALDI_ERR << "TableReader: error detected closing archive " + << PrintableRxfilename(archive_rxfilename_); + } + // If you don't want this exception to be thrown you can + // call Close() and check the status. + if (state_ == kHaveObject) + holder_.Clear(); + } + private: + Input input_; // Input object for the archive + Holder holder_; // Holds the object. + std::string key_; + std::string rspecifier_; + std::string archive_rxfilename_; + RspecifierOptions opts_; + enum { // [The state of the reading process] [does holder_ [is input_ + // have object] open] + kUninitialized, // Uninitialized or closed. no no + kFileStart, // [state we use internally: just opened.] no yes + kEof, // We did Next() and found eof in archive no no + kError, // Some other error no no + kHaveObject, // We read the key and the object after it. yes yes + kFreedObject, // The user called FreeCurrent(). no yes + } state_; +}; + + +template +SequentialTableReader::SequentialTableReader(const std::string &rspecifier): impl_(NULL) { + if (rspecifier != "" && !Open(rspecifier)) + KALDI_ERR << "Error constructing TableReader: rspecifier is " << rspecifier; +} + +template +bool SequentialTableReader::Open(const std::string &rspecifier) { + if (IsOpen()) + if (!Close()) + KALDI_ERR << "Could not close previously open object."; + // now impl_ will be NULL. + + RspecifierType wt = ClassifyRspecifier(rspecifier, NULL, NULL); + switch (wt) { + case kArchiveRspecifier: + impl_ = new SequentialTableReaderArchiveImpl(); + break; + case kScriptRspecifier: + impl_ = new SequentialTableReaderScriptImpl(); + break; + case kNoRspecifier: default: + KALDI_WARN << "Invalid rspecifier " << rspecifier; + return false; + } + if (!impl_->Open(rspecifier)) { + delete impl_; + impl_ = NULL; + return false; // sub-object will have printed warnings. + } + else return true; +} + +template +bool SequentialTableReader::Close() { + CheckImpl(); + bool ans = impl_->Close(); + delete impl_; // We don't keep around empty impl_ objects. + impl_ = NULL; + return ans; +} + + +template +bool SequentialTableReader::IsOpen() const { + return (impl_ != NULL); // Because we delete the object whenever + // that object is not open. Thus, the IsOpen functions of the + // Impl objects are not really needed. +} + +template +std::string SequentialTableReader::Key() { + CheckImpl(); + return impl_->Key(); // this call may throw if called wrongly in other ways, + // e.g. eof. +} + + +template +void SequentialTableReader::FreeCurrent() { + CheckImpl(); + impl_->FreeCurrent(); +} + + +template +const typename SequentialTableReader::T & +SequentialTableReader::Value() { + CheckImpl(); + return impl_->Value(); // This may throw (if LoadCurrent() returned false you are safe.). +} + + +template +void SequentialTableReader::Next() { + CheckImpl(); + impl_->Next(); +} + +template +bool SequentialTableReader::Done() { + CheckImpl(); + return impl_->Done(); +} + + +template +SequentialTableReader::~SequentialTableReader() { + if (impl_) delete impl_; + // Destructor of impl_ may throw. +} + + + +template class TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) = 0; + + // Write returns true on success, false on failure, but + // some errors may not be detected until we call Close(). + // It throws (via KALDI_ERR) if called wrongly. We could + // have just thrown on all errors, since this is what + // TableWriter does; it was designed this way because originally + // TableWriter::Write returned an exit status. + virtual bool Write(const std::string &key, const T &value) = 0; + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() = 0; + + virtual bool Close() = 0; + + virtual bool IsOpen() const = 0; + + // May throw on write error if Close was not called. + virtual ~TableWriterImplBase() { } + + TableWriterImplBase() { } + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(TableWriterImplBase); +}; + + +// The implementation of TableWriter we use when writing directly +// to an archive with no associated scp. +template +class TableWriterArchiveImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kUninitialized: + break; + case kWriteError: + KALDI_ERR << "TableWriter: opening stream, already open with write error."; + case kOpen: default: + if (!Close()) // throw because this error may not have been previously + // detected by the user. + KALDI_ERR << "TableWriter: opening stream, error closing previously open stream."; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + &archive_wxfilename_, + NULL, + &opts_); + KALDI_ASSERT(ws == kArchiveWspecifier); // or wrongly called. + + if (output_.Open(archive_wxfilename_, opts_.binary, false)) { // false means no binary header. + state_ = kOpen; + return true; + } else { + // stream will not be open. User will report this error + // (we return bool), so don't bother printing anything. + state_ = kUninitialized; + return false; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kUninitialized: return false; + case kOpen: case kWriteError: return true; + default: KALDI_ERR << "IsOpen() called on TableWriter in invalid state."; + } + return false; + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + switch (state_) { + case kOpen: break; + case kWriteError: + // user should have known from the last + // call to Write that there was a problem. + KALDI_WARN << "TableWriter: attempting to write to invalid stream."; + return false; + case kUninitialized: default: + KALDI_ERR << "TableWriter: Write called on invalid stream"; + + } + // state is now kOpen or kWriteError. + if (!IsToken(key)) // e.g. empty string or has spaces... + KALDI_ERR << "TableWriter: using invalid key " << key; + output_.Stream() << key << ' '; + if (!Holder::Write(output_.Stream(), opts_.binary, value)) { + KALDI_WARN << "TableWriter: write failure to " + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + if (state_ == kWriteError) return false; // Even if this Write seems to have + // succeeded, we fail because a previous Write failed and the archive may be + // corrupted and unreadable. + + if (opts_.flush) + Flush(); + return true; + } + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() { + switch (state_) { + case kWriteError: case kOpen: + output_.Stream().flush(); // Don't check error status. + return; + default: + KALDI_WARN << "TableWriter: Flush called on not-open writer."; + } + } + + virtual bool Close() { + if (!this->IsOpen() || !output_.IsOpen()) + KALDI_ERR << "TableWriter: Close called on a stream that was not open." << this->IsOpen() << ", " << output_.IsOpen(); + bool close_success = output_.Close(); + if (!close_success) { + KALDI_WARN << "TableWriter: error closing stream: wspecifier is " + << wspecifier_; + state_ = kUninitialized; + return false; + } + if (state_ == kWriteError) { + KALDI_WARN << "TableWriter: closing writer in error state: wspecifier is " + << wspecifier_; + state_ = kUninitialized; + return false; + } + state_ = kUninitialized; + return true; + } + + TableWriterArchiveImpl(): state_(kUninitialized) {} + + // May throw on write error if Close was not called. + virtual ~TableWriterArchiveImpl() { + if (!IsOpen()) return; + else if (!Close()) + KALDI_ERR << "At TableWriter destructor: Write failed or stream close " + << "failed: wspecifier is "<< wspecifier_; + } + + private: + Output output_; + WspecifierOptions opts_; + std::string wspecifier_; + std::string archive_wxfilename_; + enum { // is stream open? + kUninitialized, // no + kOpen, // yes + kWriteError, // yes + } state_; +}; + + + + +// The implementation of TableWriter we use when writing to +// individual files (more generally, wxfilenames) specified +// in an scp file that we read. + +// Note: the code for this class is similar to RandomAccessTableReaderScriptImpl; +// try to keep them in sync. + +template +class TableWriterScriptImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + TableWriterScriptImpl(): last_found_(0), state_(kUninitialized) {} + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kReadScript: + KALDI_ERR << " Opening already open TableWriter: call Close first."; + case kUninitialized: case kNotReadScript: + break; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + NULL, + &script_rxfilename_, + &opts_); + KALDI_ASSERT(ws == kScriptWspecifier); // or wrongly called. + KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this point. + + if (! ReadScriptFile(script_rxfilename_, + true, // print any warnings + &script_)) { // error reading script file or invalid format + state_ = kNotReadScript; + return false; // no need to print further warnings. user gets the error. + } + std::sort(script_.begin(), script_.end()); + for (size_t i = 0; i+1 < script_.size(); i++) { + if (script_[i].first.compare(script_[i+1].first) >= 0) { + // script[i] not < script[i+1] in lexical order... + KALDI_WARN << "Script file " << PrintableRxfilename(script_rxfilename_) + << " contains duplicate key " << script_[i].first; + state_ = kNotReadScript; + return false; + } + } + state_ = kReadScript; + return true; + } + + virtual bool IsOpen() const { return (state_ == kReadScript); } + + virtual bool Close() { + if (!IsOpen()) + KALDI_ERR << "Close() called on TableWriter that was not open."; + state_ = kUninitialized; + last_found_ = 0; + script_.clear(); + return true; + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + if (!IsOpen()) + KALDI_ERR << "TableWriter: Write called on invalid stream"; + + if (!IsToken(key)) // e.g. empty string or has spaces... + KALDI_ERR << "TableWriter: using invalid key " << key; + + std::string wxfilename; + if (!LookupFilename(key, &wxfilename)) { + if (opts_.permissive) { + return true; // In permissive mode, it's as if we're writing to /dev/null + // for missing keys. + } else { + KALDI_WARN << "TableWriter: script file " + << PrintableRxfilename(script_rxfilename_) + << " has no entry for key "< pr(key, ""); // Important that "" + // compares less than or equal to any string, so lower_bound points to the + // element that has the same key. + typedef typename std::vector >::const_iterator + IterType; + IterType iter = std::lower_bound(script_.begin(), script_.end(), pr); + if (iter != script_.end() && iter->first == key) { + last_found_ = iter - script_.begin(); + *wxfilename = iter->second; + return true; + } else { + return false; + } + } + + + WspecifierOptions opts_; + std::string wspecifier_; + std::string script_rxfilename_; + + // the script_ variable contains pairs of (key, filename), sorted using + // std::sort. This can be used with binary_search to look up filenames for + // writing. If this becomes inefficient we can use std::unordered_map (but I + // suspect this wouldn't be significantly faster & would use more memory). + // If memory becomes a problem here, the user should probably be passing + // only the relevant part of the scp file rather than expecting us to get too + // clever in the code. + std::vector > script_; + size_t last_found_; // This is for an optimization used in LookupFilename. + + enum { + kUninitialized, + kReadScript, + kNotReadScript, // read of script failed. + } state_; +}; + + +// The implementation of TableWriter we use when writing directly +// to an archive plus an associated scp. +template +class TableWriterBothImpl: public TableWriterImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &wspecifier) { + switch (state_) { + case kUninitialized: + break; + case kWriteError: + KALDI_ERR << "TableWriter: opening stream, already open with write error."; + case kOpen: default: + if (!Close()) // throw because this error may not have been previously detected by user. + KALDI_ERR << "TableWriter: opening stream, error closing previously open stream."; + } + wspecifier_ = wspecifier; + WspecifierType ws = ClassifyWspecifier(wspecifier, + &archive_wxfilename_, + &script_wxfilename_, + &opts_); + KALDI_ASSERT(ws == kBothWspecifier); // or wrongly called. + if (ClassifyWxfilename(archive_wxfilename_) != kFileOutput) + KALDI_WARN << "When writing to both archive and script, the script file " + "will generally not be interpreted correctly unless the archive is " + "an actual file: wspecifier = " << wspecifier; + + if (!archive_output_.Open(archive_wxfilename_, opts_.binary, false)) { // false means no binary header. + state_ = kUninitialized; + return false; + } + if (!script_output_.Open(script_wxfilename_, false, false)) { // first false means text mode: + // script files always text-mode. second false means don't write header (doesn't matter + // for text mode). + archive_output_.Close(); // Don't care about status: error anyway. + state_ = kUninitialized; + return false; + } + state_ = kOpen; + return true; + } + + virtual bool IsOpen() const { + switch (state_) { + case kUninitialized: return false; + case kOpen: case kWriteError: return true; + default: KALDI_ERR << "IsOpen() called on TableWriter in invalid state."; + } + return false; + } + + void MakeFilename(typename std::ostream::pos_type streampos, std::string *output) const { + std::ostringstream ss; + ss << ':' << streampos; + KALDI_ASSERT(ss.str() != ":-1"); + *output = archive_wxfilename_ + ss.str(); + + // e.g. /some/file:12302. + // Note that we warned if archive_wxfilename_ is not an actual filename; + // the philosophy is we give the user rope and if they want to hang + // themselves, with it, fine. + } + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual bool Write(const std::string &key, const T &value) { + switch (state_) { + case kOpen: break; + case kWriteError: + // user should have known from the last + // call to Write that there was a problem. Warn about it. + KALDI_WARN << "TableWriter: writing to non-open TableWriter object."; + return false; + case kUninitialized: default: + KALDI_ERR << "TableWriter: Write called on invalid stream"; + } + // state is now kOpen or kWriteError. + if (!IsToken(key)) // e.g. empty string or has spaces... + KALDI_ERR << "TableWriter: using invalid key " << key; + std::ostream &archive_os = archive_output_.Stream(); + archive_os << key << ' '; + typename std::ostream::pos_type archive_os_pos = archive_os.tellp(); + // position at start of Write() to archive. We will record this in the script file. + std::string offset_rxfilename; // rxfilename with offset into the archive, + // e.g. some_archive_name.ark:431541423 + MakeFilename(archive_os_pos, &offset_rxfilename); + + // Write to the script file first. + // The idea is that we want to get all the information possible into the + // script file, to make it easier to unwind errors later. + std::ostream &script_os = script_output_.Stream(); + script_output_.Stream() << key << ' ' << offset_rxfilename << '\n'; + + if (!Holder::Write(archive_output_.Stream(), opts_.binary, value)) { + KALDI_WARN << "TableWriter: write failure to" + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + + if (script_os.fail()) { + KALDI_WARN << "TableWriter: write failure to script file detected: " + << PrintableWxfilename(script_wxfilename_); + state_ = kWriteError; + return false; + } + + if (archive_os.fail()) { + KALDI_WARN << "TableWriter: write failure to archive file detected: " + << PrintableWxfilename(archive_wxfilename_); + state_ = kWriteError; + return false; + } + + if (state_ == kWriteError) return false; // Even if this Write seems to have + // succeeded, we fail because a previous Write failed and the archive may be + // corrupted and unreadable. + + if (opts_.flush) + Flush(); + return true; + } + + // Flush will flush any archive; it does not return error status, + // any errors will be reported on the next Write or Close. + virtual void Flush() { + switch (state_) { + case kWriteError: case kOpen: + archive_output_.Stream().flush(); // Don't check error status. + script_output_.Stream().flush(); // Don't check error status. + return; + default: + KALDI_WARN << "TableWriter: Flush called on not-open writer."; + } + } + + virtual bool Close() { + if (!this->IsOpen()) + KALDI_ERR << "TableWriter: Close called on a stream that was not open."; + bool close_success = true; + if (archive_output_.IsOpen()) + if (!archive_output_.Close()) close_success = false; + if (script_output_.IsOpen()) + if (!script_output_.Close()) close_success = false; + bool ans = close_success && (state_ != kWriteError); + state_ = kUninitialized; + return ans; + } + + TableWriterBothImpl(): state_(kUninitialized) {} + + // May throw on write error if Close() was not called. + // User can get the error status by calling Close(). + virtual ~TableWriterBothImpl() { + if (!IsOpen()) return; + else if (!Close()) + KALDI_ERR << "At TableWriter destructor: Write failed or stream close failed: " + << wspecifier_; + } + + private: + Output archive_output_; + Output script_output_; + WspecifierOptions opts_; + std::string archive_wxfilename_; + std::string script_wxfilename_; + std::string wspecifier_; + enum { // is stream open? + kUninitialized, // no + kOpen, // yes + kWriteError, // yes + } state_; +}; + + +template +TableWriter::TableWriter(const std::string &wspecifier): impl_(NULL) { + if (wspecifier != "" && !Open(wspecifier)) { + KALDI_ERR << "TableWriter: failed to write to " + << wspecifier; + } +} + +template +bool TableWriter::IsOpen() const { + return (impl_ != NULL); +} + + +template +bool TableWriter::Open(const std::string &wspecifier) { + + if (IsOpen()) { + if (!Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "TableWriter::Open, failed to close previously open writer."; + } + KALDI_ASSERT(impl_ == NULL); + WspecifierType wtype = ClassifyWspecifier(wspecifier, NULL, NULL, NULL); + switch (wtype) { + case kBothWspecifier: + impl_ = new TableWriterBothImpl(); + break; + case kArchiveWspecifier: + impl_ = new TableWriterArchiveImpl(); + break; + case kScriptWspecifier: + impl_ = new TableWriterScriptImpl(); + break; + case kNoWspecifier: default: + KALDI_WARN << "ClassifyWspecifier: invalid wspecifier " << wspecifier; + return false; + } + if (impl_->Open(wspecifier)) return true; + else { // The class will have printed a more specific warning. + delete impl_; + impl_ = NULL; + return false; + } +} + +template +void TableWriter::Write(const std::string &key, + const T &value) const { + CheckImpl(); + if (!impl_->Write(key, value)) + KALDI_ERR << "Error in TableWriter::Write"; + // More specific warning will have + // been printed in the Write function. +} + +template +void TableWriter::Flush() { + CheckImpl(); + impl_->Flush(); +} + +template +bool TableWriter::Close() { + CheckImpl(); + bool ans = impl_->Close(); + delete impl_; // We don't keep around non-open impl_ objects [c.f. definition of IsOpen()] + impl_ = NULL; + return ans; +} + +template +TableWriter::~TableWriter() { + if (IsOpen() && !Close()) { + KALDI_ERR << "Error closing TableWriter [in destructor]."; + } +} + + +// Types of RandomAccessTableReader: +// In principle, we would like to have four types of RandomAccessTableReader: +// the 4 combinations [scp, archive], [seekable, not-seekable], +// where if something is seekable we only store a file offset. However, +// it seems sufficient for now to only implement two of these, in both +// cases assuming it's not seekable so we never store file offsets and always +// store either the scp line or the data in the archive. The reasons are: +// (1) +// For scp files, storing the actual entry is not that much more expensive +// than storing the file offsets (since the entries are just filenames), and +// avoids a lot of fseek operations that might be expensive. +// (2) +// For archive files, there is no real reason, if you have the archive file +// on disk somewhere, why you wouldn't access it via its associated scp. +// [i.e. write it as ark, scp]. The main reason to read archives directly +// is if they are part of a pipe, and in this case it's not seekable, so +// we implement only this case. +// +// Note that we will rarely in practice have to keep in memory everything in +// the archive, as long as things are only read once from the archive (the +// "o, " or "once" option) and as long as we keep our keys in sorted order; to take +// advantage of this we need the "s, " (sorted) option, so we would read archives +// as e.g. "s, o, ark:-" (this is the rspecifier we would use if it was the +// standard input and these conditions held). + +template class RandomAccessTableReaderImplBase { + public: + typedef typename Holder::T T; + + virtual bool Open(const std::string &rspecifier) = 0; + + virtual bool HasKey(const std::string &key) = 0; + + virtual const T &Value(const std::string &key) = 0; + + virtual bool Close() = 0; + + virtual ~RandomAccessTableReaderImplBase() {} +}; + + +// Implementation of RandomAccessTableReader for a script file; for simplicity we +// just read it in all in one go, as it's unlikely someone would generate this +// from a pipe. In principle we could read it on-demand as for the archives, but +// this would probably be overkill. + +// Note: the code for this this class is similar to TableWriterScriptImpl: +// try to keep them in sync. +template +class RandomAccessTableReaderScriptImpl: + public RandomAccessTableReaderImplBase { + + public: + typedef typename Holder::T T; + + RandomAccessTableReaderScriptImpl(): last_found_(0), state_(kUninitialized) {} + + virtual bool Open(const std::string &rspecifier) { + switch (state_) { + case kNotHaveObject: case kHaveObject: case kGaveObject: + KALDI_ERR << " Opening already open RandomAccessTableReader: call Close first."; + case kUninitialized: case kNotReadScript: + break; + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, + &script_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kScriptRspecifier); // or wrongly called. + KALDI_ASSERT(script_.empty()); // no way it could be nonempty at this point. + + if (! ReadScriptFile(script_rxfilename_, + true, // print any warnings + &script_)) { // error reading script file or invalid format + state_ = kNotReadScript; + return false; // no need to print further warnings. user gets the error. + } + + rspecifier_ = rspecifier; + // If opts_.sorted, the user has asserted that the keys are already sorted. + // Although we could easily sort them, we want to let the user know of this + // mistake. This same mistake could have serious effects if used with an + // archive rather than a script. + if (!opts_.sorted) + std::sort(script_.begin(), script_.end()); + for (size_t i = 0; i+1 < script_.size(); i++) { + if (script_[i].first.compare(script_[i+1].first) >= 0) { + // script[i] not < script[i+1] in lexical order... + bool same = (script_[i].first == script_[i+1].first); + KALDI_WARN << "Script file " << PrintableRxfilename(script_rxfilename_) + << (same ? " contains duplicate key: " : + " is not sorted (remove s, option or add ns, option): key is ") + << script_[i].first; + state_ = kNotReadScript; + return false; + } + } + state_ = kNotHaveObject; + return true; + } + + virtual bool IsOpen() const { + return (state_ == kNotHaveObject || state_ == kHaveObject || + state_ == kGaveObject); + } + + virtual bool Close() { + if (!IsOpen()) + KALDI_ERR << "Close() called on RandomAccessTableReader that was not open."; + holder_.Clear(); + state_ = kUninitialized; + last_found_ = 0; + script_.clear(); + current_key_ = ""; + // This one cannot fail because any errors of a "global" + // nature would have been detected when we did Open(). + // With archives it's different. + return true; + } + + virtual bool HasKey(const std::string &key) { + bool preload = opts_.permissive; + // In permissive mode, we have to check that we can read + // the scp entry before we assert that the key is there. + return HasKeyInternal(key, preload); + } + + + // Write returns true on success, false on failure, but + // some errors may not be detected till we call Close(). + virtual const T& Value(const std::string &key) { + + if (!IsOpen()) + KALDI_ERR << "Value() called on non-open object."; + + if (!((state_ == kHaveObject || state_ == kGaveObject) + && key == current_key_)) { // Not already stored... + bool has_key = HasKeyInternal(key, true); // preload. + if (!has_key) + KALDI_ERR << "Could not get item for key " << key + << ", rspecifier is " << rspecifier_ << "[to ignore this, " + << "add the p, (permissive) option to the rspecifier."; + KALDI_ASSERT(state_ == kHaveObject && key == current_key_); + } + + if (state_ == kHaveObject) { + state_ = kGaveObject; + if (opts_.once) MakeTombstone(key); // make sure that future lookups fail. + return holder_.Value(); + } else { // state_ == kGaveObject + if (opts_.once) + KALDI_ERR << "Value called twice for the same key and ,o (once) option " + << "is used: rspecifier is " << rspecifier_; + return holder_.Value(); + } + } + + virtual ~RandomAccessTableReaderScriptImpl() { + if (state_ == kHaveObject || state_ == kGaveObject) + holder_.Clear(); + } + + private: + // HasKeyInternal when called with preload == false just tells us whether the + // key is in the scp. With preload == true, which happens when the ,p + // (permissive) option is given in the rspecifier, it will also check that we + // can preload the object from disk (loading from the rxfilename in the scp), + // and only return true if we can. This function is called both from HasKey + // and from Value(). + virtual bool HasKeyInternal(const std::string &key, bool preload) { + switch (state_) { + case kUninitialized: case kNotReadScript: + KALDI_ERR << "HasKey called on RandomAccessTableReader object that is not open."; + case kHaveObject: case kGaveObject: + if (key == current_key_) + return true; + break; + default: break; + } + KALDI_ASSERT(IsToken(key)); + size_t key_pos = 0; // set to zero to suppress warning + bool ans = LookupKey(key, &key_pos); + if (!ans) return false; + else { + // First do a check regarding the "once" option. + if (opts_.once && script_[key_pos].second == "") { // A "tombstone"; user is asking about + // already-read key. + KALDI_ERR << "HasKey called on key whose value was already read, and " + " you specified the \"once\" option (o, ): try removing o, or adding no, :" + " rspecifier is " << rspecifier_; + } + if (!preload) + return true; // we have the key. + else { // preload specified, so we have to pre-load the object before returning true. + if (!input_.Open(script_[key_pos].second)) { + KALDI_WARN << "Error opening stream " + << PrintableRxfilename(script_[key_pos].second); + return false; + } else { + // Make sure holder empty. + if (state_ == kHaveObject || state_ == kGaveObject) + holder_.Clear(); + if (holder_.Read(input_.Stream())) { + state_ = kHaveObject; + current_key_ = key; + return true; + } else { + KALDI_WARN << "Error reading object from " + "stream " << PrintableRxfilename(script_[key_pos].second); + state_ = kNotHaveObject; + return false; + } + } + } + } + } + void MakeTombstone(const std::string &key) { + size_t offset; + if (!LookupKey(key, &offset)) + KALDI_ERR << "RandomAccessTableReader object in inconsistent state."; + else + script_[offset].second = ""; + } + bool LookupKey(const std::string &key, size_t *script_offset) { + // First, an optimization: if we're going consecutively, this will + // make the lookup very fast. Since we may call HasKey and then + // Value(), which both may look up the key, we test if either the + // current or next position are correct. + if (last_found_ < script_.size() && script_[last_found_].first == key) { + *script_offset = last_found_; + return true; + } + last_found_++; + if (last_found_ < script_.size() && script_[last_found_].first == key) { + *script_offset = last_found_; + return true; + } + std::pair pr(key, ""); // Important that "" + // compares less than or equal to any string, so lower_bound points to the + // element that has the same key. + typedef typename std::vector >::const_iterator + IterType; + IterType iter = std::lower_bound(script_.begin(), script_.end(), pr); + if (iter != script_.end() && iter->first == key) { + last_found_ = *script_offset = iter - script_.begin(); + return true; + } else { + return false; + } + } + + + Input input_; // Use the same input_ object for reading each file, in case + // the scp specifies offsets in an archive (so we can keep the same file open). + RspecifierOptions opts_; + std::string rspecifier_; // rspecifier used to open it; used in debug messages + std::string script_rxfilename_; // filename of script. + + std::string current_key_; // Key of object in holder_ + Holder holder_; + + // the script_ variable contains pairs of (key, filename), sorted using + // std::sort. This can be used with binary_search to look up filenames for + // writing. If this becomes inefficient we can use std::unordered_map (but I + // suspect this wouldn't be significantly faster & would use more memory). + // If memory becomes a problem here, the user should probably be passing + // only the relevant part of the scp file rather than expecting us to get too + // clever in the code. + std::vector > script_; + size_t last_found_; // This is for an optimization used in FindFilename. + + enum { // [Do we have [Does holder_ + // script_ set up?] contain object?] + kUninitialized, // no no + kNotReadScript, // no no + kNotHaveObject, // yes no + kHaveObject, // yes yes + kGaveObject, // yes yes + // [kGaveObject is as kHaveObject but we note that the + // user has already read it; this is for checking that + // if "once" is specified, the user actually only reads + // it once. + } state_; + +}; + + + + +// This is the base-class (with some implemented functions) for the +// implementations of RandomAccessTableReader when it's an archive. This +// base-class handles opening the files, storing the state of the reading +// process, and loading objects. This is the only case in which we have +// an intermediate class in the hierarchy between the virtual ImplBase +// class and the actual Impl classes. +// The child classes vary in the assumptions regarding sorting, etc. + +template class RandomAccessTableReaderArchiveImplBase: + public RandomAccessTableReaderImplBase { + public: + typedef typename Holder::T T; + + RandomAccessTableReaderArchiveImplBase(): holder_(NULL), state_(kUninitialized) { } + + virtual bool Open(const std::string &rspecifier) { + if (state_ != kUninitialized) { + if (! this->Close()) // call Close() yourself to suppress this exception. + KALDI_ERR << "TableReader::Open, error closing previous input."; + } + rspecifier_ = rspecifier; + RspecifierType rs = ClassifyRspecifier(rspecifier, &archive_rxfilename_, + &opts_); + KALDI_ASSERT(rs == kArchiveRspecifier); + + // NULL means don't expect binary-mode header + bool ans; + if (Holder::IsReadInBinary()) + ans = input_.Open(archive_rxfilename_, NULL); + else + ans = input_.OpenTextMode(archive_rxfilename_); + if (!ans) { // header. + KALDI_WARN << "TableReader: failed to open stream " + << PrintableRxfilename(archive_rxfilename_); + state_ = kUninitialized; // Failure on Open + return false; // User should print the error message. + } else { + state_ = kNoObject; + } + return true; + } + + // ReadNextObject() requires that the state be kNoObject, + // and it will try read the next object. If it succeeds, + // it sets the state to kHaveObject, and + // cur_key_ and holder_ have the key and value. If it fails, + // it sets the state to kError or kEof. + void ReadNextObject() { + if (state_ != kNoObject) + KALDI_ERR << "TableReader: ReadNextObject() called from wrong state."; // Code error + // somewhere in this class or a child class. + std::istream &is = input_.Stream(); + is.clear(); // Clear any fail bits that may have been set... just in case + // this happened in the Read function. + is >> cur_key_; // This eats up any leading whitespace and gets the string. + if (is.eof()) { + state_ = kEof; + return; + } + if (is.fail()) { // This shouldn't really happen, barring file-system errors. + KALDI_WARN << "Error reading archive: rspecifier is " << rspecifier_; + state_ = kError; + return; + } + int c; + if ((c = is.peek()) != ' ' && c != '\t' && c != '\n') { // We expect a space ' ' after the key. + // We also allow tab, just so we can read archives generated by scripts that may + // not be fully aware of how this format works. + KALDI_WARN << "Invalid archive file format: expected space after key " <(is.peek())) << ", reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + return; + } + if (c != '\n') is.get(); // Consume the space or tab. + holder_ = new Holder; + if (holder_->Read(is)) { + state_ = kHaveObject; + return; + } else { + KALDI_WARN << "Object read failed, reading archive " + << PrintableRxfilename(archive_rxfilename_); + state_ = kError; + delete holder_; + holder_ = NULL; + return; + } + } + + virtual bool IsOpen() const { + switch (state_) { + case kEof: case kError: case kHaveObject: case kNoObject: return true; + case kUninitialized: return false; + default: KALDI_ERR << "IsOpen() called on invalid object."; + return false; + } + } + + // Called by the child-class virutal Close() functions; does the + // shared parts of the cleanup. + bool CloseInternal() { + if (! this->IsOpen()) + KALDI_ERR << "Close() called on TableReader twice or otherwise wrongly."; + if (input_.IsOpen()) + input_.Close(); + if (state_ == kHaveObject) { + KALDI_ASSERT(holder_ != NULL); + delete holder_; + holder_ = NULL; + } else KALDI_ASSERT(holder_ == NULL); + bool ans = (state_ != kError); + state_ = kUninitialized; + if (!ans && opts_.permissive) { + KALDI_WARN << "Error state detected closing reader. " + << "Ignoring it because you specified permissive mode."; + return true; + } + return ans; + } + + ~RandomAccessTableReaderArchiveImplBase() { + // The child class has the responsibility to call CloseInternal(). + KALDI_ASSERT(state_ == kUninitialized && holder_ == NULL); + } + private: + Input input_; // Input object for the archive + protected: + // The variables below are accessed by child classes. + + std::string cur_key_; // current key (if state == kHaveObject). + Holder *holder_; // Holds the object we just read (if state == kHaveObject) + + std::string rspecifier_; + std::string archive_rxfilename_; + RspecifierOptions opts_; + + enum { // [The state of the reading process] [does holder_ [is input_ + // have object] open] + kUninitialized, // Uninitialized or closed no no + kNoObject, // Do not have object in holder_ no yes + kHaveObject, // Have object in holder_ yes yes + kEof, // End of file no yes + kError, // Some kind of error-state in the reading. no yes + } state_; + +}; + + +// RandomAccessTableReaderDSortedArchiveImpl (DSorted for "doubly sorted") is the +// implementation for random-access reading of archives when both the archive, +// and the calling code, are in sorted order (i.e. we ask for the keys in sorted +// order). This is when the s and cs options are both given. It only ever has +// to keep one object in memory. It inherits from +// RandomAccessTableReaderArchiveImplBase which implements the common parts of +// RandomAccessTableReader that are used when it's an archive we're reading from. + +template class RandomAccessTableReaderDSortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + public: + typedef typename Holder::T T; + + RandomAccessTableReaderDSortedArchiveImpl() { } + + virtual bool Close() { + // We don't have anything additional to clean up, so just + // call generic base-class one. + return this->CloseInternal(); + } + + virtual bool HasKey(const std::string &key) { + return FindKeyInternal(key); + } + virtual const T & Value(const std::string &key) { + if (FindKeyInternal(key)) { + KALDI_ASSERT(this->state_ == kHaveObject && key == this->cur_key_ + && holder_ != NULL); + return this->holder_->Value(); + } else { + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + return *(const T*)NULL; // keep compiler happy. + } + } + + virtual ~RandomAccessTableReaderDSortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). + KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + // FindKeyInternal tries to find the key by calling "ReadNextObject()" + // as many times as necessary till we get to it. It is called from + // both FindKey and Value(). + bool FindKeyInternal(const std::string &key) { + // First check that the user is calling us right: should be + // in sorted order. If not, error. + if (!last_requested_key_.empty()) { + if (key.compare(last_requested_key_) < 0) { // key < last_requested_key_ + KALDI_ERR << "You provided the \"cs\" option " + << "but are not calling with keys in sorted order: " + << key << " < " << last_requested_key_ << ": rspecifier is " + << rspecifier_; + } + } + // last_requested_key_ is just for debugging of order of calling. + last_requested_key_ = key; + + if (state_ == kNoObject) + ReadNextObject(); // This can only happen + // once, the first time someone calls HasKey() or Value(). We don't + // do it in the initializer to stop the program hanging too soon, + // if reading from a pipe. + + if (state_ == kEof || state_ == kError) return false; + + if (state_ == kUninitialized) + KALDI_ERR << "Trying to access a RandomAccessTableReader object that is not open."; + + std::string last_key_; // To check that + // the archive we're reading is in sorted order. + while (1) { + KALDI_ASSERT(state_ == kHaveObject); + int compare = key.compare(cur_key_); + if (compare == 0) { // key == key_ + return true; // we got it.. + } else if (compare < 0) { // key < cur_key_, so we already read past the + // place where we want to be. This implies that we will never find it + // [due to the sorting etc., this means it just isn't in the archive]. + return false; + } else { // compare > 0, key > cur_key_. We need to read further ahead. + last_key_ = cur_key_; + // read next object.. we have to set state to kNoObject first. + KALDI_ASSERT(holder_ != NULL); + delete holder_; + holder_ = NULL; + state_ = kNoObject; + ReadNextObject(); + if (state_ != kHaveObject) + return false; // eof or read error. + if (cur_key_.compare(last_key_) <= 0) { + KALDI_ERR << "You provided the \"s\" option " + << " (sorted order), but keys are out of order or duplicated: " + << last_key_ << " is followed by " << cur_key_ + << ": rspecifier is " << rspecifier_; + } + } + } + } + + /// Last string provided to HasKey() or Value(); + std::string last_requested_key_; + + +}; + +// RandomAccessTableReaderSortedArchiveImpl is for random-access reading of +// archives when the user specified the sorted (s) option but not the +// called-sorted (cs) options. +template class RandomAccessTableReaderSortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + + public: + typedef typename Holder::T T; + + RandomAccessTableReaderSortedArchiveImpl(): + last_found_index_(static_cast(-1)), + pending_delete_(static_cast(-1)) { } + + virtual bool Close() { + for (size_t i = 0; i < seen_pairs_.size(); i++) + if (seen_pairs_[i].second) + delete seen_pairs_[i].second; + seen_pairs_.clear(); + + pending_delete_ = static_cast(-1); + last_found_index_ = static_cast(-1); + + return this->CloseInternal(); + } + virtual bool HasKey(const std::string &key) { + HandlePendingDelete(); + size_t index; + bool ans = FindKeyInternal(key, &index); + if (ans && opts_.once && seen_pairs_[index].second == NULL) { + // Just do a check RE the once option. "&&opts_.once" is for + // efficiency since this can only happen in that case. + KALDI_ERR << "Error: HasKey called after Value() already called for " + << " that key, and once (o) option specified: rspecifier is " + << rspecifier_; + } + return ans; + } + virtual const T & Value(const std::string &key) { + HandlePendingDelete(); + size_t index; + if (FindKeyInternal(key, &index)) { + if (seen_pairs_[index].second == NULL) { // can happen if opts.once_ + KALDI_ERR << "Error: Value() called more than once for key " + << key << " and once (o) option specified: rspecifier is " + << rspecifier_; + } + if (opts_.once) + pending_delete_ = index; // mark this index to be deleted on next call. + return seen_pairs_[index].second->Value(); + } else { + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + return *(const T*)NULL; // keep compiler happy. + } + } + virtual ~RandomAccessTableReaderSortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). + KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + void HandlePendingDelete() { + const size_t npos = static_cast(-1); + if (pending_delete_ != npos) { + KALDI_ASSERT(pending_delete_ < seen_pairs_.size()); + KALDI_ASSERT(seen_pairs_[pending_delete_].second != NULL); + delete seen_pairs_[pending_delete_].second; + seen_pairs_[pending_delete_].second = NULL; + pending_delete_ = npos; + } + } + + // FindKeyInternal tries to find the key in the array "seen_pairs_". + // If it is not already there, it reads ahead as far as necessary + // to determine whether we have the key or not. On success it returns + // true and puts the index into the array seen_pairs_, into "index"; + // on failure it returns false. + // It will leave the state as either kNoObject, kEof or kError. + // FindKeyInternal does not do any checking about whether you are asking + // about a key that has been already given (with the "once" option). + // That is the user's responsibility. + + bool FindKeyInternal(const std::string &key, size_t *index) { + // First, an optimization in case the previous call was for the + // same key, and we found it. + if (last_found_index_ < seen_pairs_.size() + && seen_pairs_[last_found_index_].first == key) { + *index = last_found_index_; + return true; + } + + if (state_ == kUninitialized) + KALDI_ERR << "Trying to access a RandomAccessTableReader object that is not open."; + + // Step one is to see whether we have to read ahead for the object.. + // Note, the possible states right now are kNoObject, kEof or kError. + // We are never in the state kHaveObject except just after calling + // ReadNextObject(). + bool looped = false; + while (state_ == kNoObject && + (seen_pairs_.empty() || key.compare(seen_pairs_.back().first) > 0)) { + looped = true; + // Read this as: + // while ( the stream is potentially good for reading && + // ([got no keys] || key > most_recent_key) ) { ... + // Try to read a new object. + // Note that the keys in seen_pairs_ are ordered from least to greatest. + ReadNextObject(); + if (state_ == kHaveObject) { // Successfully read object. + if (!seen_pairs_.empty() && // This is just a check. + cur_key_.compare(seen_pairs_.back().first) <= 0) { + // read the expression above as: !( cur_key_ > previous_key). + // it means we are not in sorted order [the user specified that we + // are, or we would not be using this implementation]. + KALDI_ERR << "You provided the sorted (s) option but keys in archive " + << PrintableRxfilename(archive_rxfilename_) << " are not " + << "in sorted order: " << seen_pairs_.back().first + << " is followed by " << cur_key_; + } + KALDI_ASSERT(holder_ != NULL); + seen_pairs_.push_back(std::make_pair(cur_key_, holder_)); + holder_ = NULL; + state_ = kNoObject; + } + } + if (looped) { // We only need to check the last element of the seen_pairs_ array, + // since we would not have read more after getting "key". + if (!seen_pairs_.empty() && seen_pairs_.back().first == key) { + last_found_index_ = *index = seen_pairs_.size() - 1; + return true; + } else return false; + } + // Now we have do an actual binary search in the seen_pairs_ array. + std::pair pr(key, static_cast(NULL)); + typename std::vector >::iterator + iter = std::lower_bound(seen_pairs_.begin(), seen_pairs_.end(), + pr, PairCompare()); + if (iter != seen_pairs_.end() && + key == iter->first) { + last_found_index_ = *index = (iter - seen_pairs_.begin()); + return true; + } else return false; + } + + // These are the pairs of (key, object) we have read. We keep all the keys we + // have read but the actual objects (if they are stored with pointers inside + // the Holder object) may be deallocated if once == true, and the Holder + // pointer set to NULL. + std::vector > seen_pairs_; + size_t last_found_index_; // An optimization s.t. if FindKeyInternal called twice with + // same key (as it often will), it doesn't have to do the key search twice. + size_t pending_delete_; // If opts_.once == true, this is the index of + // element of seen_pairs_ that is pending deletion. + struct PairCompare { + // PairCompare is the Less-than operator for the pairs of(key, Holder). + // compares the keys. + inline bool operator() (const std::pair &pr1, + const std::pair &pr2) { + return (pr1.first.compare(pr2.first) < 0); + } + }; +}; + + + +// RandomAccessTableReaderUnsortedArchiveImpl is for random-access reading of +// archives when the user does not specify the sorted (s) option (in this case +// the called-sorted, or "cs" option, is ignored). This is the least efficient +// of the random access archive readers, in general, but it can be as efficient +// as the others, in speed, memory and latency, if the "once" option is specified +// and it happens that the keys of the archive are the same as the keys the code +// is called with (to HasKey() and Value()), and in the same order. However, if +// you ask it for a key that's not present it will have to read the archive till +// the end and store it all in memory. + +template class RandomAccessTableReaderUnsortedArchiveImpl: + public RandomAccessTableReaderArchiveImplBase { + using RandomAccessTableReaderArchiveImplBase::kUninitialized; + using RandomAccessTableReaderArchiveImplBase::kHaveObject; + using RandomAccessTableReaderArchiveImplBase::kNoObject; + using RandomAccessTableReaderArchiveImplBase::kEof; + using RandomAccessTableReaderArchiveImplBase::kError; + using RandomAccessTableReaderArchiveImplBase::state_; + using RandomAccessTableReaderArchiveImplBase::opts_; + using RandomAccessTableReaderArchiveImplBase::cur_key_; + using RandomAccessTableReaderArchiveImplBase::holder_; + using RandomAccessTableReaderArchiveImplBase::rspecifier_; + using RandomAccessTableReaderArchiveImplBase::archive_rxfilename_; + using RandomAccessTableReaderArchiveImplBase::ReadNextObject; + + typedef typename Holder::T T; + + public: + RandomAccessTableReaderUnsortedArchiveImpl(): to_delete_iter_(map_.end()), + to_delete_iter_valid_(false) + { + map_.max_load_factor(0.5); // make it quite empty -> quite efficient. + // default seems to be 1. + } + + virtual bool Close() { + for (typename MapType::iterator iter = map_.begin(); + iter != map_.end(); + ++iter) { + if (iter->second) + delete iter->second; + } + map_.clear(); + first_deleted_string_ = ""; + to_delete_iter_valid_ = false; + return this->CloseInternal(); + } + + virtual bool HasKey(const std::string &key) { + HandlePendingDelete(); + return FindKeyInternal(key, NULL); + } + virtual const T & Value(const std::string &key) { + HandlePendingDelete(); + const T *ans_ptr = NULL; + if (FindKeyInternal(key, &ans_ptr)) + return *ans_ptr; + else + KALDI_ERR << "Value() called but no such key " << key + << " in archive " << PrintableRxfilename(archive_rxfilename_); + return *(const T*)NULL; // keep compiler happy. + } + virtual ~RandomAccessTableReaderUnsortedArchiveImpl() { + if (this->IsOpen()) + if (!Close()) // more specific warning will already have been printed. + // we are in some kind of error state & user did not find out by + // calling Close(). + KALDI_ERR << "Error closing RandomAccessTableReader: rspecifier is " + << rspecifier_; + } + private: + void HandlePendingDelete() { + if (to_delete_iter_valid_) { + to_delete_iter_valid_ = false; + delete to_delete_iter_->second; // Delete Holder object. + if (first_deleted_string_.length() == 0) + first_deleted_string_ = to_delete_iter_->first; + map_.erase(to_delete_iter_); // delete that element. + } + } + + // FindKeyInternal tries to find the key in the map "map_" + // If it is not already there, it reads ahead either until it finds the + // key, or until end of file. If called with value_ptr == NULL, + // it assumes it's called from HasKey() and just returns true or false + // and doesn't otherwise have side effects. If called with value_ptr != + // NULL, it assumes it's called from Value(). Thus, it will crash + // if it cannot find the key. If it can find it it puts its address in + // *value_ptr, and if opts_once == true it will mark that element of the + // map to be deleted. + + bool FindKeyInternal(const std::string &key, const T **value_ptr = NULL) { + typename MapType::iterator iter = map_.find(key); + if (iter != map_.end()) { // Found in the map... + if (value_ptr == NULL) { // called from HasKey + return true; // this is all we have to do. + } else { + *value_ptr = &(iter->second->Value()); + if (opts_.once) { // value won't be needed again, so mark + // for deletion. + to_delete_iter_ = iter; // pending delete. + KALDI_ASSERT(!to_delete_iter_valid_); + to_delete_iter_valid_ = true; + } + return true; + } + } + while (state_ == kNoObject) { + ReadNextObject(); + if (state_ == kHaveObject) { // Successfully read object. + state_ = kNoObject; // we are about to transfer ownership + // of the object in holder_ to map_. + // Insert it into map_. + std::pair pr = + map_.insert(typename MapType::value_type(cur_key_, holder_)); + + if (!pr.second) { // Was not inserted-- previous element w/ same key + delete holder_; // map was not changed, no ownership transferred. + holder_ = NULL; + KALDI_ERR << "Error in RandomAccessTableReader: duplicate key " + << cur_key_ << " in archive " << archive_rxfilename_; + } + holder_ = NULL; // ownership transferred to map_. + if (cur_key_ == key) { // the one we wanted.. + if (value_ptr == NULL) { // called from HasKey + return true; + } else { // called from Value() + *value_ptr = &(pr.first->second->Value()); // this gives us the + // Value() from the Holder in the map. + if (opts_.once) { // mark for deletion, as won't be needed again. + to_delete_iter_ = pr.first; + KALDI_ASSERT(!to_delete_iter_valid_); + to_delete_iter_valid_ = true; + } + return true; + } + } + } + } + if (opts_.once && key == first_deleted_string_) { + KALDI_ERR << "You specified the once (o) option but " + << "you are calling using key " << key + << " more than once: rspecifier is " << rspecifier_; + } + return false; // We read the entire archive (or got to error state) and didn't + // find it. + } + + typedef unordered_map MapType; + MapType map_; + + typename MapType::iterator to_delete_iter_; + bool to_delete_iter_valid_; + + std::string first_deleted_string_; // keep the first string we deleted + // from map_ (if opts_.once == true). It's for an inexact spot-check that the + // "once" option isn't being used incorrectly. + +}; + + + + + +template +RandomAccessTableReader::RandomAccessTableReader(const std::string &rspecifier): + impl_(NULL) { + if (rspecifier != "" && !Open(rspecifier)) + KALDI_ERR << "Error opening RandomAccessTableReader object " + " (rspecifier is: " << rspecifier << ")"; +} + +template +bool RandomAccessTableReader::Open(const std::string &rspecifier) { + if (IsOpen()) + KALDI_ERR << "Already open."; + RspecifierOptions opts; + RspecifierType rs = ClassifyRspecifier(rspecifier, NULL, &opts); + switch (rs) { + case kScriptRspecifier: + impl_ = new RandomAccessTableReaderScriptImpl(); + break; + case kArchiveRspecifier: + if (opts.sorted) { + if (opts.called_sorted) // "doubly" sorted case. + impl_ = new RandomAccessTableReaderDSortedArchiveImpl(); + else + impl_ = new RandomAccessTableReaderSortedArchiveImpl(); + } else impl_ = new RandomAccessTableReaderUnsortedArchiveImpl(); + break; + case kNoRspecifier: default: + KALDI_WARN << "Invalid rspecifier: " + << rspecifier; + return false; + } + if (impl_->Open(rspecifier)) + return true; + else { + // Warning will already have been printed. + delete impl_; + impl_ = NULL; + return false; + } +} + +template +bool RandomAccessTableReader::HasKey(const std::string &key) { + CheckImpl(); + if (!IsToken(key)) + KALDI_ERR << "Invalid key \"" << key << '"'; + return impl_->HasKey(key); +} + + +template +const typename RandomAccessTableReader::T& +RandomAccessTableReader::Value(const std::string &key) { + CheckImpl(); + return impl_->Value(key); +} + +template +bool RandomAccessTableReader::Close() { + CheckImpl(); + bool ans =impl_->Close(); + delete impl_; + impl_ = NULL; + return ans; +} + +template +RandomAccessTableReader::~RandomAccessTableReader() { + if (IsOpen() && !Close()) // call Close() yourself to stop this being thrown. + KALDI_ERR << "failure detected in destructor."; +} + +template +void SequentialTableReader::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty SequentialTableReader (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +void RandomAccessTableReader::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty RandomAccessTableReader (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +void TableWriter::CheckImpl() const { + if (!impl_) { + KALDI_ERR << "Trying to use empty TableWriter (perhaps you " + << "passed the empty string as an argument to a program?)"; + } +} + +template +RandomAccessTableReaderMapped::RandomAccessTableReaderMapped( + const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename): + reader_(table_rxfilename), token_reader_(table_rxfilename.empty() ? "" : + utt2spk_rxfilename), + utt2spk_rxfilename_(utt2spk_rxfilename) { } + +template +bool RandomAccessTableReaderMapped::Open( + const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename) { + if (reader_.IsOpen()) reader_.Close(); + if (token_reader_.IsOpen()) token_reader_.Close(); + KALDI_ASSERT(!table_rxfilename.empty()); + if (!reader_.Open(table_rxfilename)) return false; // will have printed + // warning internally, probably. + if (!utt2spk_rxfilename.empty()) { + if (!token_reader_.Open(utt2spk_rxfilename)) { + reader_.Close(); + return false; + } + } + return true; +} + + +template +bool RandomAccessTableReaderMapped::HasKey(const std::string &utt) { + // We don't check IsOpen, we let the call go through to the member variable + // (reader_), which will crash with a more informative error message than + // we can give here, as we don't any longer know the rxfilename. + if (token_reader_.IsOpen()) { // We need to map the key from utt to spk. + if (!token_reader_.HasKey(utt)) + KALDI_ERR << "Attempting to read key " << utt << ", which is not present " + << "in utt2spk map or similar map being read from " + << PrintableRxfilename(utt2spk_rxfilename_); + const std::string &spk = token_reader_.Value(utt); + return reader_.HasKey(spk); + } else { + return reader_.HasKey(utt); + } +} + +template +const typename Holder::T& RandomAccessTableReaderMapped::Value( + const std::string &utt) { + if (token_reader_.IsOpen()) { // We need to map the key from utt to spk. + if (!token_reader_.HasKey(utt)) + KALDI_ERR << "Attempting to read key " << utt << ", which is not present " + << "in utt2spk map or similar map being read from " + << PrintableRxfilename(utt2spk_rxfilename_); + const std::string &spk = token_reader_.Value(utt); + return reader_.Value(spk); + } else { + return reader_.Value(utt); + } +} + + + +/// @} + +} // end namespace kaldi + + + +#endif diff --git a/kaldi_io/src/kaldi/util/kaldi-table.h b/kaldi_io/src/kaldi/util/kaldi-table.h new file mode 100644 index 0000000..6f6cb98 --- /dev/null +++ b/kaldi_io/src/kaldi/util/kaldi-table.h @@ -0,0 +1,459 @@ +// util/kaldi-table.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_KALDI_TABLE_H_ +#define KALDI_UTIL_KALDI_TABLE_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-holder.h" + +namespace kaldi { + +// Forward declarations +template class RandomAccessTableReaderImplBase; +template class SequentialTableReaderImplBase; +template class TableWriterImplBase; + +/// \addtogroup table_group +/// @{ + +// This header defines the Table classes (RandomAccessTableReader, +// SequentialTableReader and TableWriter) and explains what the Holder classes, +// which the Table class requires as a template argument, are like. It also +// explains the "rspecifier" and "wspecifier" concepts (these are strings that +// explain how to read/write objects via archives or scp files. A table is +// conceptually a collection of objects of a particular type T indexed by keys +// of type std::string (these Keys additionally have an order within each table). +// The Table classes are templated on a type (call it Holder) such that Holder::T +// is a typedef equal to T. + +// see kaldi-holder.h for detail on the Holder classes. + +typedef std::vector KeyList; + +// Documentation for "wspecifier" +// "wspecifier" describes how we write a set of objects indexed by keys. +// The basic, unadorned wspecifiers are as follows: +// +// ark:wxfilename +// scp:rxfilename +// ark,scp:filename,wxfilename +// ark,scp:filename,wxfilename +// +// +// We also allow the following modifiers: +// t means text mode. +// b means binary mode. +// f means flush the stream after writing each entry. +// (nf means don't flush, and isn't very useful as the default is to flush). +// p means permissive mode, when writing to an "scp" file only: will ignore +// missing scp entries, i.e. won't write anything for those files but will +// return success status). +// +// So the following are valid wspecifiers: +// ark,b,f:foo +// "ark,b,b:| gzip -c > foo" +// "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz" +// ark,b:- +// +// The meanings of rxfilename and wxfilename are as described in +// kaldi-stream.h (they are filenames but include pipes, stdin/stdout +// and so on; filename is a regular filename. +// + +// The ark:wxfilename type of wspecifier instructs the class to +// write directly to an archive. For small objects (e.g. lists of ints), +// the text archive format will generally be human readable with one line +// per entry in the archive. +// +// The type "scp:xfilename" refers to an scp file which should +// already exist on disk, and tells us where to write the data for +// each key (usually an actual file); each line of the scp file +// would be: +// key xfilename +// +// The type ark,scp:filename,wxfilename means +// we write both an archive and an scp file that specifies offsets into the +// archive, with lines like: +// key filename:12407 +// where the number is the byte offset into the file. +// In this case we restrict the archive-filename to be an actual filename, +// as we can't see a situtation where an extended filename would make sense +// for this (we can't fseek() in pipes). + +enum WspecifierType { + kNoWspecifier, + kArchiveWspecifier, + kScriptWspecifier, + kBothWspecifier +}; + +struct WspecifierOptions { + bool binary; + bool flush; + bool permissive; // will ignore absent scp entries. + WspecifierOptions(): binary(true), flush(false), permissive(false) { } +}; + +// ClassifyWspecifier returns the type of the wspecifier string, +// and (if pointers are non-NULL) outputs the extra information +// about the options, and the script and archive +// filenames. +WspecifierType ClassifyWspecifier(const std::string &wspecifier, + std::string *archive_wxfilename, + std::string *script_wxfilename, + WspecifierOptions *opts); + +// ReadScriptFile reads an .scp file in its entirety, and appends it +// (in order as it was in the scp file) in script_out_, which contains +// pairs of (key, xfilename). The .scp +// file format is: on each line, key xfilename +// where xfilename means rxfilename or wxfilename, and may contain internal spaces +// (we trim away any leading or trailing space). The key is space-free. +// ReadScriptFile returns true if the format was valid (empty files +// are valid). +// If 'print_warnings', it will print out warning messages that explain what kind +// of error there was. +bool ReadScriptFile(const std::string &rxfilename, + bool print_warnings, + std::vector > *script_out); + +// This version of ReadScriptFile works from an istream. +bool ReadScriptFile(std::istream &is, + bool print_warnings, + std::vector > *script_out); + +// Writes, for each entry in script, the first element, then ' ', then the second +// element then '\n'. Checks that the keys (first elements of pairs) are valid +// tokens (nonempty, no whitespace), and the values (second elements of pairs) +// are newline-free and contain no leading or trailing space. Returns true on +// success. +bool WriteScriptFile(const std::string &wxfilename, + const std::vector > &script); + +// This version writes to an ostream. +bool WriteScriptFile(std::ostream &os, + const std::vector > &script); + +// Documentation for "rspecifier" +// "rspecifier" describes how we read a set of objects indexed by keys. +// The possibilities are: +// +// ark:rxfilename +// scp:rxfilename +// +// We also allow various modifiers: +// o means the program will only ask for each key once, which enables +// the reader to discard already-asked-for values. +// s means the keys are sorted on input (means we don't have to read till +// eof if someone asked for a key that wasn't there). +// cs means that it is called in sorted order (we are generally asserting this +// based on knowledge of how the program works). +// p means "permissive", and causes it to skip over keys whose corresponding +// scp-file entries cannot be read. [and to ignore errors in archives and +// script files, and just consider the "good" entries]. +// We allow the negation of the options above, as in no, ns, np, +// but these aren't currently very useful (just equivalent to omitting the +// corresponding option). +// [any of the above options can be prefixed by n to negate them, e.g. no, ns, +// ncs, np; but these aren't currently useful as you could just omit the option]. +// +// b is ignored [for scripting convenience] +// t is ignored [for scripting convenience] +// +// +// So for instance the following would be a valid rspecifier: +// +// "o, s, p, ark:gunzip -c foo.gz|" + +struct RspecifierOptions { + // These options only make a difference for the RandomAccessTableReader class. + bool once; // we assert that the program will only ask for each key once. + bool sorted; // we assert that the keys are sorted. + bool called_sorted; // we assert that the (HasKey(), Value() functions will + // also be called in sorted order. [this implies "once" but not vice versa]. + bool permissive; // If "permissive", when reading from scp files it treats + // scp files that can't be read as if the corresponding key were not there. + // For archive files it will suppress errors getting thrown if the archive + + // is corrupted and can't be read to the end. + + RspecifierOptions(): once(false), sorted(false), + called_sorted(false), permissive(false) { } +}; + +enum RspecifierType { + kNoRspecifier, + kArchiveRspecifier, + kScriptRspecifier +}; + +RspecifierType ClassifyRspecifier(const std::string &rspecifier, std::string *rxfilename, + RspecifierOptions *opts); + +// Class Table is useful when you want the entire set of +// objects in memory. NOT IMPLEMENTED YET. +// It is the least scalable way of accessing data in Tables. +// The *TableReader and TableWriter classes are more scalable. + + +/// Allows random access to a collection +/// of objects in an archive or script file; see \ref io_sec_tables. +template +class RandomAccessTableReader { + public: + typedef typename Holder::T T; + + RandomAccessTableReader(): impl_(NULL) { } + + // This constructor equivalent to default constructor + "open", but + // throws on error. + RandomAccessTableReader(const std::string &rspecifier); + + // Opens the table. + bool Open(const std::string &rspecifier); + + // Returns true if table is open. + bool IsOpen() const { return (impl_ != NULL); } + + // Close() will close the table [throws if it was not open], + // and returns true on success (false if we were reading an + // archive and we discovered an error in the archive). + bool Close(); + + // Says if it has this key. + // If you are using the "permissive" (p) read option, + // it will return false for keys whose corresponding entry + // in the scp file cannot be read. + + bool HasKey(const std::string &key); + + // Value() may throw if you are reading an scp file, you + // do not have the "permissive" (p) option, and an entry + // in the scp file cannot be read. Typically you won't + // want to catch this error. + const T &Value(const std::string &key); + + ~RandomAccessTableReader(); + + // Allow copy-constructor only for non-opened readers (needed for inclusion in + // stl vector) + RandomAccessTableReader(const RandomAccessTableReader &other): + impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); } + private: + // Disallow assignment. + RandomAccessTableReader &operator=(const RandomAccessTableReader&); + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. + RandomAccessTableReaderImplBase *impl_; +}; + + + +/// A templated class for reading objects sequentially from an archive or script +/// file; see \ref io_sec_tables. +template +class SequentialTableReader { + public: + typedef typename Holder::T T; + + SequentialTableReader(): impl_(NULL) { } + + // This constructor equivalent to default constructor + "open", but + // throws on error. + SequentialTableReader(const std::string &rspecifier); + + // Opens the table. Returns exit status; but does throw if previously + // open stream was in error state. Call Close to stop this [anyway, + // calling Open more than once is not recommended.] + bool Open(const std::string &rspecifier); + + // Returns true if we're done. It will also return true if there's some kind + // of error and we can't read any more; in this case, you can detect the + // error by calling Close and checking the return status; otherwise + // the destructor will throw. + inline bool Done(); + + // Only valid to call Key() if Done() returned false. + inline std::string Key(); + + // FreeCurrent() is provided as an optimization to save memory, for large + // objects. It instructs the class to deallocate the current value. The + // reference Value() will/ be invalidated by this. + + void FreeCurrent(); + + // Return reference to the current value. + // The reference is valid till next call to this object. + // If will throw if you are reading an scp file, did not + // specify the "permissive" (p) option and the file cannot + // be read. [The permissive option makes it behave as if that + // key does not even exist, if the corresponding file cannot be + // read.] You probably wouldn't want to catch this exception; + // the user can just specify the p option in the rspecifier. + const T &Value(); + + // Next goes to the next key. It will not throw; any error will + // result in Done() returning true, and then the destructor will + // throw unless you call Close(). + void Next(); + + // Returns true if table is open for reading (does not imply + // stream is in good state). + bool IsOpen() const; + + // Close() will return false (failure) if Done() became true + // because of an error/ condition rather than because we are + // really done [e.g. because of an error or early termination + // in the archive]. + // If there is an error and you don't call Close(), the destructor + // will fail. + // Close() + bool Close(); + + // The destructor may throw. This is the desired behaviour, as it's the way we + // signal the error to the user (to detect it, call Close(). The issue is that + // otherwise the user has no way to tell whether Done() returned true because + // we reached the end of the archive or script, or because there was an error + // that prevented further reading. + ~SequentialTableReader(); + + // Allow copy-constructor only for non-opened readers (needed for inclusion in + // stl vector) + SequentialTableReader(const SequentialTableReader &other): + impl_(NULL) { KALDI_ASSERT(other.impl_ == NULL); } + private: + // Disallow assignment. + SequentialTableReader &operator = (const SequentialTableReader&); + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. + SequentialTableReaderImplBase *impl_; +}; + + +/// A templated class for writing objects to an +/// archive or script file; see \ref io_sec_tables. +template +class TableWriter { + public: + typedef typename Holder::T T; + + TableWriter(): impl_(NULL) { } + + // This constructor equivalent to default constructor + // + "open", but throws on error. See docs for + // wspecifier above. + TableWriter(const std::string &wspecifier); + + // Opens the table. See docs for wspecifier above. + // If it returns true, it is open. + bool Open(const std::string &wspecifier); + + // Returns true if open for writing. + bool IsOpen() const; + + // Write the object. Throws std::runtime_error on error (via the + // KALDI_ERR macro) + inline void Write(const std::string &key, const T &value) const; + + + // Flush will flush any archive; it does not return error status + // or throw, any errors will be reported on the next Write or Close. + // Useful if we may be writing to a command in a pipe and want + // to ensure good CPU utilization. + void Flush(); + + // Close() is not necessary to call, as the destructor + // closes it; it's mainly useful if you want to handle + // error states because the destructor will throw on + // error if you do not call Close(). + bool Close(); + + ~TableWriter(); + + // Allow copy-constructor only for non-opened writers (needed for inclusion in + // stl vector) + TableWriter(const TableWriter &other): impl_(NULL) { + KALDI_ASSERT(other.impl_ == NULL); + } + private: + TableWriter &operator = (const TableWriter&); // Disallow assignment. + void CheckImpl() const; // Checks that impl_ is non-NULL; prints an error + // message and dies (with KALDI_ERR) if NULL. + TableWriterImplBase *impl_; +}; + + +/// This class is for when you are reading something in random access, but +/// it may actually be stored per-speaker (or something similar) but the +/// keys you're using are per utterance. So you also provide an "rxfilename" +/// for a file containing lines like +/// utt1 spk1 +/// utt2 spk1 +/// utt3 spk1 +/// and so on. Note: this is optional; if it is an empty string, we just won't +/// do the mapping. Also, "table_rxfilename" may be the empty string (as for +/// a regular table), in which case the table just won't be opened. +/// We provide only the most frequently used of the functions of RandomAccessTableReader. + +template +class RandomAccessTableReaderMapped { + public: + typedef typename Holder::T T; + /// Note: "utt2spk_rxfilename" will in the normal case be an rxfilename + /// for an utterance to speaker map, but this code is general; it accepts + /// a generic map. + RandomAccessTableReaderMapped(const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename); + + RandomAccessTableReaderMapped() {}; + + /// Note: when calling Open, utt2spk_rxfilename may be empty. + bool Open(const std::string &table_rxfilename, + const std::string &utt2spk_rxfilename); + + bool HasKey(const std::string &key); + const T &Value(const std::string &key); + inline bool IsOpen() const { return reader_.IsOpen(); } + inline bool Close() { return reader_.Close(); } + + + + // The default copy-constructor will do what we want: it will crash + // for already-opened readers, by calling the member-variable copy-constructors. + private: + // Disallow assignment. + RandomAccessTableReaderMapped &operator=(const RandomAccessTableReaderMapped&); + RandomAccessTableReader reader_; + RandomAccessTableReader token_reader_; + std::string utt2spk_rxfilename_; // Used only in diagnostic messages. +}; + + +/// @} end "addtogroup table_group" +} // end namespace kaldi + +#include "kaldi-table-inl.h" + +#endif // KALDI_UTIL_KALDI_TABLE_H_ diff --git a/kaldi_io/src/kaldi/util/parse-options.h b/kaldi_io/src/kaldi/util/parse-options.h new file mode 100644 index 0000000..f563b54 --- /dev/null +++ b/kaldi_io/src/kaldi/util/parse-options.h @@ -0,0 +1,264 @@ +// util/parse-options.h + +// Copyright 2009-2011 Karel Vesely; Microsoft Corporation; +// Saarland University (Author: Arnab Ghoshal); +// Copyright 2012-2013 Frantisek Skala; Arnab Ghoshal + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_PARSE_OPTIONS_H_ +#define KALDI_UTIL_PARSE_OPTIONS_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "itf/options-itf.h" + +namespace kaldi { + +/// The class ParseOptions is for parsing command-line options; see +/// \ref parse_options for more documentation. +class ParseOptions : public OptionsItf { + public: + explicit ParseOptions(const char *usage) : + print_args_(true), help_(false), usage_(usage), argc_(0), argv_(NULL), + prefix_(""), other_parser_(NULL) { +#ifndef _MSC_VER // This is just a convenient place to set the stderr to line + setlinebuf(stderr); // buffering mode, since it's called at program start. +#endif // This helps ensure different programs' output is not mixed up. + RegisterStandard("config", &config_, "Configuration file to read (this " + "option may be repeated)"); + RegisterStandard("print-args", &print_args_, + "Print the command line arguments (to stderr)"); + RegisterStandard("help", &help_, "Print out usage message"); + RegisterStandard("verbose", &g_kaldi_verbose_level, + "Verbose level (higher->more logging)"); + } + + /** + This is a constructor for the special case where some options are + registered with a prefix to avoid conflicts. The object thus created will + only be used temporarily to register an options class with the original + options parser (which is passed as the *other pointer) using the given + prefix. It should not be used for any other purpose, and the prefix must + not be the empty string. It seems to be the least bad way of implementing + options with prefixes at this point. + Example of usage is: + ParseOptions po; // original ParseOptions object + ParseOptions po_mfcc("mfcc", &po); // object with prefix. + MfccOptions mfcc_opts; + mfcc_opts.Register(&po_mfcc); + The options will now get registered as, e.g., --mfcc.frame-shift=10.0 + instead of just --frame-shift=10.0 + */ + ParseOptions(const std::string &prefix, OptionsItf *other); + + ~ParseOptions() {} + + // Methods from the interface + void Register(const std::string &name, + bool *ptr, const std::string &doc); + void Register(const std::string &name, + int32 *ptr, const std::string &doc); + void Register(const std::string &name, + uint32 *ptr, const std::string &doc); + void Register(const std::string &name, + float *ptr, const std::string &doc); + void Register(const std::string &name, + double *ptr, const std::string &doc); + void Register(const std::string &name, + std::string *ptr, const std::string &doc); + + /// If called after registering an option and before calling + /// Read(), disables that option from being used. Will crash + /// at runtime if that option had not been registered. + void DisableOption(const std::string &name); + + /// This one is used for registering standard parameters of all the programs + template + void RegisterStandard(const std::string &name, + T *ptr, const std::string &doc); + + /** + Parses the command line options and fills the ParseOptions-registered + variables. This must be called after all the variables were registered!!! + + Initially the variables have implicit values, + then the config file values are set-up, + finally the command line vaues given. + Returns the first position in argv that was not used. + [typically not useful: use NumParams() and GetParam(). ] + */ + int Read(int argc, const char *const *argv); + + /// Prints the usage documentation [provided in the constructor]. + void PrintUsage(bool print_command_line = false); + /// Prints the actual configuration of all the registered variables + void PrintConfig(std::ostream &os); + + /// Reads the options values from a config file. Must be called after + /// registering all options. This is usually used internally after the + /// standard --config option is used, but it may also be called from a + /// program. + void ReadConfigFile(const std::string &filename); + + /// Number of positional parameters (c.f. argc-1). + int NumArgs() const; + + /// Returns one of the positional parameters; 1-based indexing for argc/argv + /// compatibility. Will crash if param is not >=1 and <=NumArgs(). + std::string GetArg(int param) const; + + std::string GetOptArg(int param) const { + return (param <= NumArgs() ? GetArg(param) : ""); + } + + /// The following function will return a possibly quoted and escaped + /// version of "str", according to the current shell. Currently + /// this is just hardwired to bash. It's useful for debug output. + static std::string Escape(const std::string &str); + + private: + /// Template to register various variable types, + /// used for program-specific parameters + template + void RegisterTmpl(const std::string &name, T *ptr, const std::string &doc); + + // Following functions do just the datatype-specific part of the job + /// Register boolean variable + void RegisterSpecific(const std::string &name, const std::string &idx, + bool *b, const std::string &doc, bool is_standard); + /// Register int32 variable + void RegisterSpecific(const std::string &name, const std::string &idx, + int32 *i, const std::string &doc, bool is_standard); + /// Register unsinged int32 variable + void RegisterSpecific(const std::string &name, const std::string &idx, + uint32 *u, + const std::string &doc, bool is_standard); + /// Register float variable + void RegisterSpecific(const std::string &name, const std::string &idx, + float *f, const std::string &doc, bool is_standard); + /// Register double variable [useful as we change BaseFloat type]. + void RegisterSpecific(const std::string &name, const std::string &idx, + double *f, const std::string &doc, bool is_standard); + /// Register string variable + void RegisterSpecific(const std::string &name, const std::string &idx, + std::string *s, const std::string &doc, + bool is_standard); + + /// Does the actual job for both kinds of parameters + /// Does the common part of the job for all datatypes, + /// then calls RegisterSpecific + template + void RegisterCommon(const std::string &name, + T *ptr, const std::string &doc, bool is_standard); + + /// SplitLongArg parses an argument of the form --a=b, --a=, or --a, + /// and sets "has_equal_sign" to true if an equals-sign was parsed.. + /// this is needed in order to correctly allow --x for a boolean option + /// x, and --y= for a string option y, and to disallow --x= and --y. + void SplitLongArg(std::string in, std::string *key, std::string *value, + bool *has_equal_sign); + + void NormalizeArgName(std::string *str); + + /// Set option with name "key" to "value"; will crash if can't do it. + /// "has_equal_sign" is used to allow --x for a boolean option x, + /// and --y=, for a string option y. + bool SetOption(const std::string &key, const std::string &value, + bool has_equal_sign); + + bool ToBool(std::string str); + int32 ToInt(std::string str); + uint32 ToUInt(std::string str); + float ToFloat(std::string str); + double ToDouble(std::string str); + + // maps for option variables + std::map bool_map_; + std::map int_map_; + std::map uint_map_; + std::map float_map_; + std::map double_map_; + std::map string_map_; + + /** + Structure for options' documentation + */ + struct DocInfo { + DocInfo() {} + DocInfo(const std::string &name, const std::string &usemsg) + : name_(name), use_msg_(usemsg), is_standard_(false) {} + DocInfo(const std::string &name, const std::string &usemsg, + bool is_standard) + : name_(name), use_msg_(usemsg), is_standard_(is_standard) {} + + std::string name_; + std::string use_msg_; + bool is_standard_; + }; + typedef std::map DocMapType; + DocMapType doc_map_; ///< map for the documentation + + bool print_args_; ///< variable for the implicit --print-args parameter + bool help_; ///< variable for the implicit --help parameter + std::string config_; ///< variable for the implicit --config parameter + std::vector positional_args_; + const char *usage_; + int argc_; + const char *const *argv_; + + /// These members are not normally used. They are only used when the object + /// is constructed with a prefix + std::string prefix_; + OptionsItf *other_parser_; +}; + +/// This template is provided for convenience in reading config classes from +/// files; this is not the standard way to read configuration options, but may +/// occasionally be needed. This function assumes the config has a function +/// "void Register(OptionsItf *po)" which it can call to register the +/// ParseOptions object. +template void ReadConfigFromFile(const std::string config_filename, + C *c) { + std::ostringstream usage_str; + usage_str << "Parsing config from " + << "from '" << config_filename << "'"; + ParseOptions po(usage_str.str().c_str()); + c->Register(&po); + po.ReadConfigFile(config_filename); +} + +/// This variant of the template ReadConfigFromFile is for if you need to read +/// two config classes from the same file. +template void ReadConfigsFromFile(const std::string config_filename, + C1 *c1, C2 *c2) { + std::ostringstream usage_str; + usage_str << "Parsing config from " + << "from '" << config_filename << "'"; + ParseOptions po(usage_str.str().c_str()); + c1->Register(&po); + c2->Register(&po); + po.ReadConfigFile(config_filename); +} + + + +} // namespace kaldi + +#endif // KALDI_UTIL_PARSE_OPTIONS_H_ diff --git a/kaldi_io/src/kaldi/util/simple-io-funcs.h b/kaldi_io/src/kaldi/util/simple-io-funcs.h new file mode 100644 index 0000000..56573e4 --- /dev/null +++ b/kaldi_io/src/kaldi/util/simple-io-funcs.h @@ -0,0 +1,56 @@ +// util/simple-io-funcs.h + +// Copyright 2009-2011 Microsoft Corporation; Jan Silovsky + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_UTIL_SIMPLE_IO_FUNCS_H_ +#define KALDI_UTIL_SIMPLE_IO_FUNCS_H_ + +#include "kaldi-io.h" + +// This header contains some utilities for reading some common, simple text formats: +// integers in files, one per line, and integers in files, possibly multiple per line. +// these are not really fully native Kaldi formats; they are mostly for small files that +// might be generated by scripts, and can be read all at one time. +// for longer files of this type, we would probably use the Table code. + +namespace kaldi { + +/// WriteToList attempts to write this list of integers, one per line, +/// to the given file, in text format. +/// returns true if succeeded. +bool WriteIntegerVectorSimple(std::string wxfilename, const std::vector &v); + +/// ReadFromList attempts to read this list of integers, one per line, +/// from the given file, in text format. +/// returns true if succeeded. +bool ReadIntegerVectorSimple(std::string rxfilename, std::vector *v); + +// This is a file format like: +// 1 2 +// 3 +// +// 4 5 6 +// etc. +bool WriteIntegerVectorVectorSimple(std::string wxfilename, const std::vector > &v); + +bool ReadIntegerVectorVectorSimple(std::string rxfilename, std::vector > *v); + + +} // end namespace kaldi. + + +#endif diff --git a/kaldi_io/src/kaldi/util/simple-options.h b/kaldi_io/src/kaldi/util/simple-options.h new file mode 100644 index 0000000..58816af --- /dev/null +++ b/kaldi_io/src/kaldi/util/simple-options.h @@ -0,0 +1,112 @@ +// util/simple-options.hh + +// Copyright 2013 Tanel Alumae, Tallinn University of Technology + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_SIMPLE_OPTIONS_H_ +#define KALDI_UTIL_SIMPLE_OPTIONS_H_ + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "itf/options-itf.h" + +namespace kaldi { + + +/// The class SimpleOptions is an implementation of OptionsItf that allows +/// setting and getting option values programmatically, i.e., via getter +/// and setter methods. It doesn't provide any command line parsing functionality. +/// The class ParseOptions should be used for command-line options. +class SimpleOptions : public OptionsItf { + public: + SimpleOptions() { + } + + virtual ~SimpleOptions() { + } + + // Methods from the interface + void Register(const std::string &name, bool *ptr, const std::string &doc); + void Register(const std::string &name, int32 *ptr, const std::string &doc); + void Register(const std::string &name, uint32 *ptr, const std::string &doc); + void Register(const std::string &name, float *ptr, const std::string &doc); + void Register(const std::string &name, double *ptr, const std::string &doc); + void Register(const std::string &name, std::string *ptr, + const std::string &doc); + + // set option with the specified key, return true if successful + bool SetOption(const std::string &key, const bool &value); + bool SetOption(const std::string &key, const int32 &value); + bool SetOption(const std::string &key, const uint32 &value); + bool SetOption(const std::string &key, const float &value); + bool SetOption(const std::string &key, const double &value); + bool SetOption(const std::string &key, const std::string &value); + bool SetOption(const std::string &key, const char* value); + + // get option with the specified key and put to 'value', + // return true if successful + bool GetOption(const std::string &key, bool *value); + bool GetOption(const std::string &key, int32 *value); + bool GetOption(const std::string &key, uint32 *value); + bool GetOption(const std::string &key, float *value); + bool GetOption(const std::string &key, double *value); + bool GetOption(const std::string &key, std::string *value); + + enum OptionType { + kBool, + kInt32, + kUint32, + kFloat, + kDouble, + kString + }; + + struct OptionInfo { + OptionInfo(const std::string &doc, OptionType type) : + doc(doc), type(type) { + } + std::string doc; + OptionType type; + }; + + std::vector > GetOptionInfoList(); + + /* + * Puts the type of the option with name 'key' in the argument 'type'. + * Return true if such option is found, false otherwise. + */ + bool GetOptionType(const std::string &key, OptionType *type); + + private: + + std::vector > option_info_list_; + + // maps for option variables + std::map bool_map_; + std::map int_map_; + std::map uint_map_; + std::map float_map_; + std::map double_map_; + std::map string_map_; +}; + +} // namespace kaldi + +#endif // KALDI_UTIL_SIMPLE_OPTIONS_H_ diff --git a/kaldi_io/src/kaldi/util/stl-utils.h b/kaldi_io/src/kaldi/util/stl-utils.h new file mode 100644 index 0000000..12526ff --- /dev/null +++ b/kaldi_io/src/kaldi/util/stl-utils.h @@ -0,0 +1,327 @@ +// util/stl-utils.h + +// Copyright 2009-2011 Microsoft Corporation; Saarland University + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_STL_UTILS_H_ +#define KALDI_UTIL_STL_UTILS_H_ + +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" + +#ifdef _MSC_VER +#include +#include +using std::unordered_map; +using std::unordered_set; +#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) +#include +#include +using std::unordered_map; +using std::unordered_set; +#else +#include +#include +using std::tr1::unordered_map; +using std::tr1::unordered_set; +#endif + + +namespace kaldi { + +/// Sorts and uniq's (removes duplicates) from a vector. +template +inline void SortAndUniq(std::vector *vec) { + std::sort(vec->begin(), vec->end()); + vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); +} + + +/// Returns true if the vector is sorted. +template +inline bool IsSorted(const std::vector &vec) { + typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); + if (iter == end) return true; + while (1) { + typename std::vector::const_iterator next_iter = iter; + ++next_iter; + if (next_iter == end) return true; // end of loop and nothing out of order + if (*next_iter < *iter) return false; + iter = next_iter; + } +} + + +/// Returns true if the vector is sorted and contains each element +/// only once. +template +inline bool IsSortedAndUniq(const std::vector &vec) { + typename std::vector::const_iterator iter = vec.begin(), end = vec.end(); + if (iter == end) return true; + while (1) { + typename std::vector::const_iterator next_iter = iter; + ++next_iter; + if (next_iter == end) return true; // end of loop and nothing out of order + if (*next_iter <= *iter) return false; + iter = next_iter; + } +} + + +/// Removes duplicate elements from a sorted list. +template +inline void Uniq(std::vector *vec) { // must be already sorted. + KALDI_PARANOID_ASSERT(IsSorted(*vec)); + KALDI_ASSERT(vec); + vec->erase(std::unique(vec->begin(), vec->end()), vec->end()); +} + +/// Copies the elements of a set to a vector. +template +void CopySetToVector(const std::set &s, std::vector *v) { + // adds members of s to v, in sorted order from lowest to highest + // (because the set was in sorted order). + KALDI_ASSERT(v != NULL); + v->resize(s.size()); + typename std::set::const_iterator siter = s.begin(), send = s.end(); + typename std::vector::iterator viter = v->begin(); + for (; siter != send; ++siter, ++viter) { + *viter = *siter; + } +} + +template +void CopySetToVector(const unordered_set &s, std::vector *v) { + // adds members of s to v, in sorted order from lowest to highest + // (because the set was in sorted order). + KALDI_ASSERT(v != NULL); + v->resize(s.size()); + typename unordered_set::const_iterator siter = s.begin(), send = s.end(); + typename std::vector::iterator viter = v->begin(); + for (; siter != send; ++siter, ++viter) { + *viter = *siter; + } +} + + +/// Copies the (key, value) pairs in a map to a vector of pairs. +template +void CopyMapToVector(const std::map &m, + std::vector > *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector >::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = std::make_pair(miter->first, miter->second); + // do it like this because of const casting. + } +} + +/// Copies the keys in a map to a vector. +template +void CopyMapKeysToVector(const std::map &m, std::vector *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = miter->first; + } +} + +/// Copies the values in a map to a vector. +template +void CopyMapValuesToVector(const std::map &m, std::vector *v) { + KALDI_ASSERT(v != NULL); + v->resize(m.size()); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + typename std::vector::iterator viter = v->begin(); + for (; miter != mend; ++miter, ++viter) { + *viter = miter->second; + } +} + +/// Copies the keys in a map to a set. +template +void CopyMapKeysToSet(const std::map &m, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + for (; miter != mend; ++miter) { + s->insert(s->end(), miter->first); + } +} + +/// Copies the values in a map to a set. +template +void CopyMapValuesToSet(const std::map &m, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::map::const_iterator miter = m.begin(), mend = m.end(); + for (; miter != mend; ++miter) + s->insert(s->end(), miter->second); +} + + +/// Copies the contents of a vector to a set. +template +void CopyVectorToSet(const std::vector &v, std::set *s) { + KALDI_ASSERT(s != NULL); + s->clear(); + typename std::vector::const_iterator iter = v.begin(), end = v.end(); + for (; iter != end; ++iter) + s->insert(s->end(), *iter); + // s->end() is a hint in case v was sorted. will work regardless. +} + +/// Deletes any non-NULL pointers in the vector v, and sets +/// the corresponding entries of v to NULL +template +void DeletePointers(std::vector *v) { + KALDI_ASSERT(v != NULL); + typename std::vector::iterator iter = v->begin(), end = v->end(); + for (; iter != end; ++iter) { + if (*iter != NULL) { + delete *iter; + *iter = NULL; // set to NULL for extra safety. + } + } +} + +/// Returns true if the vector of pointers contains NULL pointers. +template +bool ContainsNullPointers(const std::vector &v) { + typename std::vector::const_iterator iter = v.begin(), end = v.end(); + for (; iter != end; ++iter) + if (*iter == static_cast (NULL)) return true; + return false; +} + +/// Copies the contents a vector of one type to a vector +/// of another type. +template +void CopyVectorToVector(const std::vector &vec_in, std::vector *vec_out) { + KALDI_ASSERT(vec_out != NULL); + vec_out->resize(vec_in.size()); + for (size_t i = 0; i < vec_in.size(); i++) + (*vec_out)[i] = static_cast (vec_in[i]); +} + +/// A hashing function-object for vectors. +template +struct VectorHasher { // hashing function for vector. + size_t operator()(const std::vector &x) const { + size_t ans = 0; + typename std::vector::const_iterator iter = x.begin(), end = x.end(); + for (; iter != end; ++iter) { + ans *= kPrime; + ans += *iter; + } + return ans; + } + VectorHasher() { // Check we're instantiated with an integer type. + KALDI_ASSERT_IS_INTEGER_TYPE(Int); + } + private: + static const int kPrime = 7853; +}; + +/// A hashing function-object for pairs of ints +template +struct PairHasher { // hashing function for pair + size_t operator()(const std::pair &x) const { + return x.first + x.second * kPrime; + } + PairHasher() { // Check we're instantiated with an integer type. + KALDI_ASSERT_IS_INTEGER_TYPE(Int); + } + private: + static const int kPrime = 7853; +}; + + +/// A hashing function object for strings. +struct StringHasher { // hashing function for std::string + size_t operator()(const std::string &str) const { + size_t ans = 0, len = str.length(); + const char *c = str.c_str(), *end = c + len; + for (; c != end; c++) { + ans *= kPrime; + ans += *c; + } + return ans; + } + private: + static const int kPrime = 7853; +}; + +/// Reverses the contents of a vector. +template +inline void ReverseVector(std::vector *vec) { + KALDI_ASSERT(vec != NULL); + size_t sz = vec->size(); + for (size_t i = 0; i < sz/2; i++) + std::swap( (*vec)[i], (*vec)[sz-1-i]); +} + + +/// Comparator object for pairs that compares only the first pair. +template +struct CompareFirstMemberOfPair { + inline bool operator() (const std::pair &p1, + const std::pair &p2) { + return p1.first < p2.first; + } +}; + +/// For a vector of pair where I is an integer and F a floating-point or +/// integer type, this function sorts a vector of type vector > on +/// the I value and then merges elements with equal I values, summing these over +/// the F component and then removing any F component with zero value. This +/// is for where the vector of pairs represents a map from the integer to float +/// component, with an "adding" type of semantics for combining the elements. +template +inline void MergePairVectorSumming(std::vector > *vec) { + KALDI_ASSERT_IS_INTEGER_TYPE(I); + CompareFirstMemberOfPair c; + std::sort(vec->begin(), vec->end(), c); // sort on 1st element. + typename std::vector >::iterator out = vec->begin(), + in = vec->begin(), end = vec->end(); + while (in < end) { + // We reach this point only at the first element of + // each stretch of identical .first elements. + *out = *in; + ++in; + while (in < end && in->first == out->first) { + out->second += in->second; // this is the merge operation. + ++in; + } + if (out->second != static_cast(0)) // Don't keep zero elements. + out++; + } + vec->erase(out, end); +} + +} // namespace kaldi + +#endif // KALDI_UTIL_STL_UTILS_H_ + diff --git a/kaldi_io/src/kaldi/util/table-types.h b/kaldi_io/src/kaldi/util/table-types.h new file mode 100644 index 0000000..313d1aa --- /dev/null +++ b/kaldi_io/src/kaldi/util/table-types.h @@ -0,0 +1,137 @@ +// util/table-types.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_UTIL_TABLE_TYPES_H_ +#define KALDI_UTIL_TABLE_TYPES_H_ +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" +#include "matrix/matrix-lib.h" + +namespace kaldi { + +// This header defines typedefs that are specific instantiations of +// the Table types. + +/// \addtogroup table_types +/// @{ + +typedef TableWriter > > BaseFloatMatrixWriter; +typedef SequentialTableReader > > SequentialBaseFloatMatrixReader; +typedef RandomAccessTableReader > > RandomAccessBaseFloatMatrixReader; +typedef RandomAccessTableReaderMapped > > RandomAccessBaseFloatMatrixReaderMapped; + +typedef TableWriter > > DoubleMatrixWriter; +typedef SequentialTableReader > > SequentialDoubleMatrixReader; +typedef RandomAccessTableReader > > RandomAccessDoubleMatrixReader; +typedef RandomAccessTableReaderMapped > > RandomAccessDoubleMatrixReaderMapped; + +typedef TableWriter > CompressedMatrixWriter; + +typedef TableWriter > > BaseFloatVectorWriter; +typedef SequentialTableReader > > SequentialBaseFloatVectorReader; +typedef RandomAccessTableReader > > RandomAccessBaseFloatVectorReader; +typedef RandomAccessTableReaderMapped > > RandomAccessBaseFloatVectorReaderMapped; + +typedef TableWriter > > DoubleVectorWriter; +typedef SequentialTableReader > > SequentialDoubleVectorReader; +typedef RandomAccessTableReader > > RandomAccessDoubleVectorReader; + +typedef TableWriter > > BaseFloatCuMatrixWriter; +typedef SequentialTableReader > > SequentialBaseFloatCuMatrixReader; +typedef RandomAccessTableReader > > RandomAccessBaseFloatCuMatrixReader; +typedef RandomAccessTableReaderMapped > > RandomAccessBaseFloatCuMatrixReaderMapped; + +typedef TableWriter > > DoubleCuMatrixWriter; +typedef SequentialTableReader > > SequentialDoubleCuMatrixReader; +typedef RandomAccessTableReader > > RandomAccessDoubleCuMatrixReader; +typedef RandomAccessTableReaderMapped > > RandomAccessDoubleCuMatrixReaderMapped; + +typedef TableWriter > > BaseFloatCuVectorWriter; +typedef SequentialTableReader > > SequentialBaseFloatCuVectorReader; +typedef RandomAccessTableReader > > RandomAccessBaseFloatCuVectorReader; +typedef RandomAccessTableReaderMapped > > RandomAccessBaseFloatCuVectorReaderMapped; + +typedef TableWriter > > DoubleCuVectorWriter; +typedef SequentialTableReader > > SequentialDoubleCuVectorReader; +typedef RandomAccessTableReader > > RandomAccessDoubleCuVectorReader; + + +typedef TableWriter > Int32Writer; +typedef SequentialTableReader > SequentialInt32Reader; +typedef RandomAccessTableReader > RandomAccessInt32Reader; + +typedef TableWriter > Int32VectorWriter; +typedef SequentialTableReader > SequentialInt32VectorReader; +typedef RandomAccessTableReader > RandomAccessInt32VectorReader; + +typedef TableWriter > Int32VectorVectorWriter; +typedef SequentialTableReader > SequentialInt32VectorVectorReader; +typedef RandomAccessTableReader > RandomAccessInt32VectorVectorReader; + +typedef TableWriter > Int32PairVectorWriter; +typedef SequentialTableReader > SequentialInt32PairVectorReader; +typedef RandomAccessTableReader > RandomAccessInt32PairVectorReader; + +typedef TableWriter > BaseFloatPairVectorWriter; +typedef SequentialTableReader > SequentialBaseFloatPairVectorReader; +typedef RandomAccessTableReader > RandomAccessBaseFloatPairVectorReader; + +typedef TableWriter > BaseFloatWriter; +typedef SequentialTableReader > SequentialBaseFloatReader; +typedef RandomAccessTableReader > RandomAccessBaseFloatReader; +typedef RandomAccessTableReaderMapped > RandomAccessBaseFloatReaderMapped; + +typedef TableWriter > DoubleWriter; +typedef SequentialTableReader > SequentialDoubleReader; +typedef RandomAccessTableReader > RandomAccessDoubleReader; + +typedef TableWriter > BoolWriter; +typedef SequentialTableReader > SequentialBoolReader; +typedef RandomAccessTableReader > RandomAccessBoolReader; + + + +/// TokenWriter is a writer specialized for std::string where the strings +/// are nonempty and whitespace-free. T == std::string +typedef TableWriter TokenWriter; +typedef SequentialTableReader SequentialTokenReader; +typedef RandomAccessTableReader RandomAccessTokenReader; + + +/// TokenVectorWriter is a writer specialized for sequences of +/// std::string where the strings are nonempty and whitespace-free. +/// T == std::vector +typedef TableWriter TokenVectorWriter; +// Ditto for SequentialTokenVectorReader. +typedef SequentialTableReader SequentialTokenVectorReader; +typedef RandomAccessTableReader RandomAccessTokenVectorReader; + + +/// @} + +// Note: for FST reader/writer, see ../fstext/fstext-utils.h +// [not done yet]. + +} // end namespace kaldi + + + +#endif diff --git a/kaldi_io/src/kaldi/util/text-utils.h b/kaldi_io/src/kaldi/util/text-utils.h new file mode 100644 index 0000000..1d85c47 --- /dev/null +++ b/kaldi_io/src/kaldi/util/text-utils.h @@ -0,0 +1,169 @@ +// util/text-utils.h + +// Copyright 2009-2011 Saarland University; Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_UTIL_TEXT_UTILS_H_ +#define KALDI_UTIL_TEXT_UTILS_H_ + +#include +#include +#include +#include +#include +#include + +#include "base/kaldi-common.h" + +namespace kaldi { + +/// Split a string using any of the single character delimiters. +/// If omit_empty_strings == true, the output will contain any +/// nonempty strings after splitting on any of the +/// characters in the delimiter. If omit_empty_strings == false, +/// the output will contain n+1 strings if there are n characters +/// in the set "delim" within the input string. In this case +/// the empty string is split to a single empty string. +void SplitStringToVector(const std::string &full, const char *delim, + bool omit_empty_strings, + std::vector *out); + +/// Joins the elements of a vector of strings into a single string using +/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings +/// in the vector are skipped. A vector of empty strings results in an empty +/// string on the output. +void JoinVectorToString(const std::vector &vec_in, + const char *delim, bool omit_empty_strings, + std::string *str_out); + + +/// Split a string (e.g. 1:2:3) into a vector of integers. +/// The delimiting char may be any character in "delim". +/// returns true on success, false on failure. +/// If omit_empty_strings == true, 1::2:3: will become +/// { 1, 2, 3 }. Otherwise it would be rejected. +/// Regardless of the value of omit_empty_strings, +/// the empty string is successfully parsed as an empty +/// vector of integers +template +bool SplitStringToIntegers(const std::string &full, + const char *delim, + bool omit_empty_strings, // typically false [but + // should probably be true + // if "delim" is spaces]. + std::vector *out) { + KALDI_ASSERT(out != NULL); + KALDI_ASSERT_IS_INTEGER_TYPE(I); + if ( *(full.c_str()) == '\0') { + out->clear(); + return true; + } + std::vector split; + SplitStringToVector(full, delim, omit_empty_strings, &split); + out->resize(split.size()); + for (size_t i = 0; i < split.size(); i++) { + const char *this_str = split[i].c_str(); + char *end = NULL; + long long int j = 0; + j = KALDI_STRTOLL(this_str, &end); + if (end == this_str || *end != '\0') { + out->clear(); + return false; + } else { + I jI = static_cast(j); + if (static_cast(jI) != j) { + // output type cannot fit this integer. + out->clear(); + return false; + } + (*out)[i] = jI; + } + } + return true; +} + +// This is defined for F = float and double. +template +bool SplitStringToFloats(const std::string &full, + const char *delim, + bool omit_empty_strings, // typically false + std::vector *out); + + +/// Converts a string into an integer via strtoll and returns false if there was +/// any kind of problem (i.e. the string was not an integer or contained extra +/// non-whitespace junk, or the integer was too large to fit into the type it is +/// being converted into). Only sets *out if everything was OK and it returns +/// true. +template +bool ConvertStringToInteger(const std::string &str, + Int *out) { + KALDI_ASSERT_IS_INTEGER_TYPE(Int); + const char *this_str = str.c_str(); + char *end = NULL; + errno = 0; + long long int i = KALDI_STRTOLL(this_str, &end); + if (end != this_str) + while (isspace(*end)) end++; + if (end == this_str || *end != '\0' || errno != 0) + return false; + Int iInt = static_cast(i); + if (static_cast(iInt) != i || (i<0 && !std::numeric_limits::is_signed)) { + return false; + } + *out = iInt; + return true; +} + + +/// ConvertStringToReal converts a string into either float or double via strtod, +/// and returns false if there was any kind of problem (i.e. the string was not a +/// floating point number or contained extra non-whitespace junk. +/// Be careful- this function will successfully read inf's or nan's. +bool ConvertStringToReal(const std::string &str, + double *out); +bool ConvertStringToReal(const std::string &str, + float *out); + + +/// Removes the beginning and trailing whitespaces from a string +void Trim(std::string *str); + + +/// Removes leading and trailing white space from the string, then splits on the +/// first section of whitespace found (if present), putting the part before the +/// whitespace in "first" and the rest in "rest". If there is no such space, +/// everything that remains after removing leading and trailing whitespace goes +/// in "first". +void SplitStringOnFirstSpace(const std::string &line, + std::string *first, + std::string *rest); + + +/// Returns true if "token" is nonempty, and all characters are +/// printable and whitespace-free. +bool IsToken(const std::string &token); + + +/// Returns true if "line" is free of \n characters and unprintable +/// characters, and does not contain leading or trailing whitespace. +bool IsLine(const std::string &line); + + +} // namespace kaldi + +#endif // KALDI_UTIL_TEXT_UTILS_H_ diff --git a/kaldi_io/src/kaldi/util/timer.h b/kaldi_io/src/kaldi/util/timer.h new file mode 100644 index 0000000..e3ee8d5 --- /dev/null +++ b/kaldi_io/src/kaldi/util/timer.h @@ -0,0 +1,27 @@ +// util/timer.h + +// Copyright 2014 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// We are temporarily leaving this file to forward #includes to +// base-timer.h. Its use is deprecated; you should directrly +// #include base/timer.h +#ifndef KALDI_UTIL_TIMER_H_ +#define KALDI_UTIL_TIMER_H_ +#pragma message warning: please do not include util/timer.h, include base/timer.h (it has been moved) +#include "base/timer.h" +#endif -- cgit v1.2.3-70-g09d2