LOOS  v2.3.2
hierarchy.cpp
1 /*
2  hierarchy
3 
4  Based on Zhang, Bhatt, and Zuckerman; JCTC, DOI: 10.1021/ct1002384
5  and code provided by the Zuckerman Lab
6  (http://www.ccbb.pitt.edu/Faculty/zuckerman/software.html)
7 
8 
9  Given a trajectory whose structures have been binned into states via
10  reference structures, computes the MFPT between states and then
11  constructs a hierarchy of states based on exchange rates
12 
13 */
14 
15 
16 /*
17 
18  This file is part of LOOS.
19 
20  LOOS (Lightweight Object-Oriented Structure library)
21  Copyright (c) 2010, Tod D. Romo
22  Department of Biochemistry and Biophysics
23  School of Medicine & Dentistry, University of Rochester
24 
25  This package (LOOS) is free software: you can redistribute it and/or modify
26  it under the terms of the GNU General Public License as published by
27  the Free Software Foundation under version 3 of the License.
28 
29  This package is distributed in the hope that it will be useful,
30  but WITHOUT ANY WARRANTY; without even the implied warranty of
31  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32  GNU General Public License for more details.
33 
34  You should have received a copy of the GNU General Public License
35  along with this program. If not, see <http://www.gnu.org/licenses/>.
36 */
37 
38 // @cond TOOLS_INTERNAL
39 
40 #include <loos.hpp>
41 #include <boost/format.hpp>
42 
43 
44 using namespace std;
45 using namespace loos;
46 
47 typedef pair<uint, uint> uPair;
48 typedef vector<uint> vUint;
49 typedef vector<vUint> vvUint;
50 
51 
52 
53 // Simple class to sort pairs of numbers based on a third (the rate)
54 struct RatePair {
55  RatePair(const double d, const uint a, const uint b) :
56  rate(d), pair(a,b) { }
57 
58  bool operator<(const RatePair& x) const {
59  return(rate > x.rate);
60  }
61 
62  double rate;
63  uPair pair;
64 };
65 
66 
67 
68 // @endcond TOOLS_INTERNAL
69 
70 // Debugging generates a lot of information about the internal state
71 // of the code. This was used to validate its operation with respect
72 // to Dan's PERL code.
73 const bool debugging = false;
74 
75 
76 
77 double mfpt(const vector<uint>& assign, const uint x, const uint y) {
78  double fpt = 0.0;
79  uint n = 0;
80 
81  bool state = false;
82  uint start = 0;
83  for (uint j=0; j<assign.size(); ++j) {
84  if (!state) {
85  if (assign[j] == x) {
86  start = j;
87  state = true;
88  }
89  } else {
90  if (assign[j] == y) {
91  fpt += (j - start);
92  ++n;
93  state = false;
94  }
95  }
96  }
97 
98 
99  return(n != 0 ? static_cast<double>(n)/fpt : 0.0);
100 }
101 
102 
103 
104 DoubleMatrix computeRates(const string& fname) {
105  ifstream ifs(fname.c_str());
106  if (!ifs) {
107  cerr << "Error- unable to open " << fname << endl;
108  exit(-1);
109  }
110 
111 
112  vector<uint> assignments = readVector<uint>(ifs);
113  uint nbins = 0;
114  for (vector<uint>::iterator i = assignments.begin(); i != assignments.end(); ++i)
115  if (*i > 0 && static_cast<uint>(*i) > nbins)
116  nbins = *i;
117 
118  ++nbins; // Bins are 0-based
119 
120  DoubleMatrix M(nbins, nbins);
121  for (uint j=0; j<nbins; ++j)
122  for (uint i=0; i<nbins; ++i) {
123  if (i == j)
124  continue;
125  M(j, i) = mfpt(assignments, j, i);
126  }
127 
128  for (uint j=0; j<nbins-1; ++j)
129  for (uint i=j+1; i<nbins; ++i)
130  if (M(j, i) > 0.0 && M(i, j) > 0.0) {
131  double d = (M(j, i) + M(i, j)) / 2.0;
132  M(j, i) = d;
133  } else
134  M(j, i) = 0.0;
135 
136  return(M);
137 }
138 
139 
140 
141 
142 vector<uPair> sortRates(const DoubleMatrix& M) {
143  vector<RatePair> rates;
144  for (uint j=0; j<M.cols()-1; ++j)
145  for (uint i=j+1; i<M.cols(); ++i)
146  if (M(j, i) != 0.0)
147  rates.push_back(RatePair(M(j, i), j, i));
148 
149  sort(rates.begin(), rates.end());
150 
151  vector<uPair> pairs;
152  if (debugging)
153  cerr << "DEBUG> PAIR_BEGIN\n";
154 
155  for (vector<RatePair>::iterator i = rates.begin(); i != rates.end(); ++i) {
156  pairs.push_back(i->pair);
157  if (debugging)
158  cerr << boost::format("%d %d = %f\n") % i->pair.first % i->pair.second % i->rate;
159  }
160 
161  if (debugging)
162  cerr << "DEBUG> PAIR_END\n";
163 
164  return(pairs);
165 }
166 
167 
168 
169 void dumpMatrix(ostream& os, const vvUint& M) {
170  os << M.size() << endl;
171  for (uint j=0; j<M.size(); ++j) {
172  os << M[j].size() << "\t";
173  copy(M[j].begin(), M[j].end(), ostream_iterator<uint>(os, "\t"));
174  os << endl;
175  }
176 }
177 
178 
179 // Yes, this really needs to be commented... Maybe in the next release.
180 vvUint cluster(const vector<uPair>& pairs) {
181  vvUint states;
182  vUint list;
183 
184  list.push_back(pairs[0].first);
185  list.push_back(pairs[0].second);
186  states.push_back(list);
187 
188  // Skip the last pair so we have 2 states...
189 
190  for (uint i=1; i<pairs.size()-1; ++i) {
191  if (debugging)
192  cerr << boost::format("DEBUG> i=%d, first=%d, second=%d\n") % i % pairs[i].first % pairs[i].second;
193 
194  bool flag1 = false, flag2 = false;
195  uint bin1_state = 0, bin1_element = 0;
196  uint bin2_state = 0, bin2_element = 0;
197 
198  for (uint j=0; j<states.size(); ++j)
199  for (uint k=0; k<states[j].size(); ++k) {
200  if (states[j][k] == pairs[i].first) {
201  bin1_state = j;
202  bin1_element = k;
203  flag1 = true;
204  }
205  if (states[j][k] == pairs[i].second) {
206  bin2_state = j;
207  bin2_element = k;
208  flag2 = true;
209  }
210  }
211 
212  if (debugging)
213  cerr << boost::format("DEBUG> flag1=%d, flag2=%d\n") % flag1 % flag2;
214 
215  if (flag1 && flag2) {
216  uint big = bin1_state;
217  uint small = bin2_state;
218  if (bin1_state < bin2_state) {
219  big = bin2_state;
220  small = bin1_state;
221  }
222 
223  if (debugging)
224  cerr << boost::format("DEBUG> small=%d, big=%d\n") % small % big;
225 
226  bool flag3 = false;
227  for (uint w = 0; w<states[big].size() && !flag3; ++w)
228  for (uint z=0; z<states[small].size() && !flag3; ++z) {
229  bool failed = true;
230  for (uint y=0; y<=i; ++y)
231  if ( (states[small][z] == pairs[y].first && states[big][w] == pairs[y].second)
232  || (states[small][z] == pairs[y].second && states[big][w] == pairs[y].first) ) {
233  failed = false;
234  break;
235  }
236  if (failed) {
237  flag3 = true;
238  if (debugging)
239  cerr << boost::format("DEBUG> Check failed for w=%d, z=%d\n") % w % z;
240  }
241  }
242 
243 
244  if (!flag3) {
245 
246  if (debugging)
247  cerr << "DEBUG> *Merging states*\n";
248 
249  copy(states[big].begin(), states[big].end(), back_inserter(states[small]));
250  states.erase(states.begin() + big);
251 
252  }
253 
254  } else if (flag1) {
255 
256  bool failed = false;
257 
258  for (uint p=0; p<states[bin1_state].size() && !failed; ++p) {
259  if (p == bin1_element)
260  continue;
261 
262  failed = true;
263  for (uint q=0; q<i; ++q)
264  if ( (states[bin1_state][p] == pairs[q].first && pairs[i].second == pairs[q].second)
265  || (states[bin1_state][p] == pairs[q].second && pairs[i].second == pairs[q].first) ) {
266  failed = false;
267  break;
268  }
269  }
270 
271  if (debugging)
272  cerr << boost::format("DEBUG> [1] failed=%d\n") % failed;
273 
274  if (!failed)
275  states[bin1_state].push_back(pairs[i].second);
276 
277  } else if (flag2) {
278 
279 
280  bool failed = false;
281 
282  for (uint p=0; p<states[bin2_state].size() && !failed; ++p) {
283  if (p == bin2_element)
284  continue;
285 
286  failed = true;
287  for (uint q=0; q<i; ++q)
288  if ( (states[bin2_state][p] == pairs[q].first && pairs[i].first == pairs[q].second)
289  || (states[bin2_state][p] == pairs[q].second && pairs[i].first == pairs[q].first) ) {
290  failed = false;
291  break;
292  }
293  }
294 
295 
296  if (debugging)
297  cerr << boost::format("DEBUG> [1] failed=%d\n") % failed;
298 
299 
300  if (!failed)
301  states[bin2_state].push_back(pairs[i].first);
302 
303  } else {
304  if (debugging)
305  cerr << "DEBUG> Adding new state.\n";
306 
307  vUint newlist;
308  newlist.push_back(pairs[i].first);
309  newlist.push_back(pairs[i].second);
310  states.push_back(newlist);
311  }
312 
313  if (debugging) {
314  dumpMatrix(cerr,states);
315  cerr << "DEBUG> --------------------------------------\n";
316  }
317 
318 
319 
320  }
321 
322  if (debugging)
323  cerr << "DEBUG> final states = " << states.size() << endl;
324  return(states);
325 }
326 
327 
328 void findOrphans(vvUint& states, const uint max_states) {
329  vUint seen;
330 
331  if (debugging)
332  cerr << "DEBUG> Finding orphans- ";
333 
334  for (vvUint::iterator v = states.begin(); v != states.end(); ++v)
335  copy(v->begin(), v->end(), back_inserter(seen));
336 
337  vUint unseen;
338  for (uint i = 0; i<max_states; ++i)
339  if (find(seen.begin(), seen.end(), i) == seen.end())
340  unseen.push_back(i);
341 
342  if (debugging)
343  cerr << "found " << unseen.size() << endl;
344 
345  if (! unseen.empty())
346  states.push_back(unseen);
347 }
348 
349 
350 string fullHelpMessage(void) {
351  string msg =
352  "\n"
353  "SYNOPSIS\n"
354  "\tPerform a hierarchical clustering needed to determine effective sample size\n"
355  "\n"
356  "DESCRIPTION\n"
357  "\n"
358  "\tThis tool implements the hierarchical clustering algorithm as part of determining\n"
359  "effective sample size described in Zhang, Batt, and Zuckerman, JCTC (2010) 6:3048-57.\n"
360  "\n"
361  "EXAMPLES\n"
362  "\n"
363  "\thierarchy assignments.asc >zuckerman.states\n"
364  "\n"
365  "SEE ALSO\n"
366  "\tufidpick, assign_frames, neff, effsize.pl\n";
367 
368  return(msg);
369 }
370 
371 
372 
373 int main(int argc, char *argv[]) {
374  if (argc == 1) {
375  cout << "Usage- " << argv[0] << " assignments_file\n";
376  exit(0);
377  }
378 
379  string hdr = invocationHeader(argc, argv);
380  int k = 1;
381  DoubleMatrix M = computeRates(argv[k++]);
382  vector<uPair> pairs = sortRates(M);
383  if (pairs.empty()) {
384  cerr << "Error- hierarchy failed to compute rates. Double-check how the assignments\n"
385  << " file was generated. Did you use ufidpick to pick the fiducials?\n"
386  << " Is the cutoff reasonable? Is the selection correct (use model-select\n"
387  << " to confirm)?\n";
388  exit(-10);
389  }
390 
391  vvUint states = cluster(pairs);
392 
393  // Not all states will get clustered, so manually search for
394  // "orphaned" ones and add them in...
395  findOrphans(states, M.rows());
396 
397 
398  cout << "# " << hdr << endl;
399  dumpMatrix(cout, states);
400 
401 
402  // If this happens, you are likely very undersampled
403  if (states.size() != 2)
404  cerr << boost::format("Warning- clustering finished with %d states.\n") % states.size();
405 }
Simple matrix template class using policy classes to determine behavior.
Definition: MatrixImpl.hpp:53
STL namespace.
std::string invocationHeader(int argc, char *argv[])
Create an invocation header.
Definition: utils.cpp:124
Namespace for most things not already encapsulated within a class.