Still more updates to 'p'
[wiggle.git] / bestmatch.c
blob3d7357777abb42c18c440c6baadf5b1cbc9df2f9
1 /*
2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * Author: Neil Brown
22 * Email: <neilb@cse.unsw.edu.au>
23 * Paper: Neil Brown
24 * School of Computer Science and Engineering
25 * The University of New South Wales
26 * Sydney, 2052
27 * Australia
31 * Find the best match for a patch against
32 * a file.
33 * The quality of a match is the length of the match minus the
34 * differential between the endpoints.
35 * We progress through the matrix recording the best
36 * match as we find it.
38 * We perform a full diagonal bredth first traversal assessing
39 * the quality of matches at each point.
40 * At each point there are two or three previous points,
41 * up, back or diagonal if there is a match.
42 * We assess the value of the match at each point and choose the
43 * best. No match at all is given a score of -3.
45 * For any point, the best possible score using that point
46 * is a complete diagonal to the nearest edge. We ignore points
47 * which cannot contibute to a better overall score.
51 /* This structure keeps track of the current match at each point.
52 * It holds the start of the match as x,k where k is the
53 * diagonal, so y = x-k.
54 * Also the length of the match so far.
55 * If l == 0, there is no match.
58 #include <malloc.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include "wiggle.h"
64 struct v {
65 int x,y; /* location of start of match */
66 int val; /* value of match from x,y to here */
67 int k; /* diagonal of last match */
68 int inmatch; /* 1 if last point was a match */
69 int c; /* chunk number */
73 * Here must must determine the 'value' of a partial match.
74 * The input parameters are:
75 * length - the total number of symbols matches
76 * errs - the total number of insertions or deletions
77 * dif - the absolute difference between number of insertions and deletions.
79 * In general we want length to be high, errs to be low, and dif to be low.
80 * Particular questions that must be answered include:
81 * - When does adding an extra symbol after a small gap improve the match
82 * - When does a match become so bad that we would rather start again.
84 * We would like symetry in our answers so that a good sequence with an out-rider on
85 * one end is evaluated the same as a good sequence with an out-rider on the other end.
86 * However to do this we cannot really use value of the good sequence to weigh in the
87 * outriders favour as in the case of a leading outrider, we do not yet know the value of
88 * of the good sequence.
89 * First, we need an arbitrary number, X, to say "Given a single symbol, after X errors, we
90 * forget that symbol". 5 seems a good number.
91 * Next we need to understand how replacements compare to insertions or deletions.
92 * Probably a replacement is the same cost as an insertion or deletion.
93 * Finally, a few large stretches are better then lots of little ones, so the number
94 * of disjoint stretches should be kept low.
95 * So:
96 * Each match after the first adds 5 to value.
97 * The first match in a string adds 6.
98 * Each non-match subtracts one unless it is the other half of a replacement.
99 * A value of 0 causes us to forget where we are and start again.
101 * We need to not only assess the value at a particular location, but also
102 * assess the maximum value we could get if all remaining symbols matched, to
103 * help exclude parts of the matrix.
104 * The value of that possibility is 6 times the number of remaining symbols, -1 if we
105 * just had a match.
107 /* dir == 0 for match, 1 for k increase, -1 for k decrease */
108 static inline void update_value(struct v *v, int dir, int k, int x)
110 if (dir == 0) {
111 if (v->val <= 0) {
112 v->x = x-1;
113 v->y = x-k-1;
114 v->inmatch = 0;
115 v->val = 4;
117 v->val += 2+v->inmatch;
118 v->inmatch = 1;
119 v->k = k;
120 } else {
121 v->inmatch = 0;
122 if (dir * (v->k - k) > 0) {
123 /* other half of replacement */
124 } else {
125 v->val -= 1;
129 static inline int best_val(struct v *v, int max)
131 if (v->val <= 0)
132 return 4+max*3-1;
133 else
134 return max*3-1+v->inmatch+v->val;
137 #ifdef OLDSTUFF
138 #if 0
139 #define value(v,kk,xx) (v.l ? (v.l - abs(kk-v.k)): -3)
140 #else
141 # if 0
142 # define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)/2): -3)
143 # else
144 # define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)*2/v.l): -3)
145 # endif
146 #endif
147 #endif
148 struct best {
149 int xlo,ylo,xhi,yhi,val;
152 static inline int min(int a, int b) {
153 return a < b ? a : b;
156 void find_best(struct file *a, struct file *b,
157 int alo, int ahi,
158 int blo, int bhi, struct best *best)
160 int klo, khi, k;
161 int f;
163 struct v *valloc = malloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
164 struct v *v = valloc + (bhi-alo+2);
166 k = klo = khi = alo-blo;
167 f = alo+blo; /* front that moves forward */
168 v[k].val = 0;
169 v[k].c = -1;
171 while (f < ahi+bhi) {
172 int x,y;
174 f++;
176 #if 0
177 if (f == ahi+bhi)
178 printf("f %d klo %d khi %d\n", f,klo,khi);
179 #endif
180 for (k=klo+1; k <= khi-1 ; k+=2) {
181 struct v vnew, vnew2;
182 x = (k+f)/2;
183 y = x-k;
184 /* first consider the diagonal */
185 if (match(&a->list[x-1], &b->list[y-1])) {
186 vnew = v[k];
187 update_value(&vnew, 0, k, x);
188 #if 0
189 printf("new %d,%d %d,%d (%d) ...",
190 vnew.x, vy(vnew), x, y, value(vnew,k,x));
191 #endif
192 if (vnew.c < 0) abort();
193 if (vnew.val > best[vnew.c].val) {
194 #if 0
195 printf("New best for %d at %d,%d %d,%d, val %d\n",
196 vnew.c, vnew.x, vnew.y,x,y,vnew.val);
197 #endif
198 best[vnew.c].xlo = vnew.x;
199 best[vnew.c].ylo = vnew.y;
200 best[vnew.c].xhi = x;
201 best[vnew.c].yhi = y;
202 best[vnew.c].val = vnew.val;
204 v[k] = vnew;
205 } else {
206 vnew = v[k+1];
207 update_value(&vnew, -1, k,x);
208 /* might cross a chunk boundary */
209 if (b->list[y-1].len && b->list[y-1].start[0]==0) {
210 vnew.c = atoi(b->list[y-1].start+1);
211 vnew.val = 0;
213 vnew2 = v[k-1];
214 update_value(&vnew2, 1, k, x);
216 if (vnew2.val > vnew.val)
217 v[k] = vnew2;
218 else
219 v[k] = vnew;
222 /* extend or contract range */
223 klo--;
224 v[klo] = v[klo+1];
225 x = (klo+f)/2; y = x-klo;
226 update_value(&v[klo],-1,klo,x);
227 if (y<=bhi && b->list[y-1].len && b->list[y-1].start[0]==0) {
228 v[klo].c = atoi(b->list[y-1].start+1);
229 #if 0
230 printf("entered %d at %d,%d\n", v[klo].c, x, y);
231 #endif
232 v[klo].val = 0;
234 while (klo+2 < (ahi-bhi) &&
235 (y > bhi ||
236 (best_val(&v[klo], min(ahi-x,bhi-y)) < best[v[klo].c].val &&
237 best_val(&v[klo+1], min(ahi-x,bhi-y+1)) < best[v[klo+1].c].val
239 )) {
240 klo+=2;
241 x = (klo+f)/2; y = x-klo;
244 khi++;
245 v[khi] = v[khi-1];
246 x = (khi+f)/2; y = x - khi;
247 update_value(&v[khi],-1,khi,x);
248 while(khi-2 > (ahi-bhi) &&
249 (x > ahi ||
250 (best_val(&v[khi], min(ahi-x,bhi-y)) < best[v[khi].c].val &&
251 best_val(&v[khi-1], min(ahi-x+1,bhi-y)) < best[v[khi].c].val
253 )) {
254 khi -= 2;
255 x = (khi+f)/2; y = x - khi;
259 free(valloc);
262 struct csl *csl_join(struct csl *c1, struct csl *c2)
264 struct csl *c,*cd, *rv;
265 int cnt;
266 if (c1 == NULL)
267 return c2;
268 if (c2 == NULL)
269 return c1;
271 cnt = 1; /* the sentinal */
272 for (c=c1; c->len; c++) cnt++;
273 for (c=c2; c->len; c++) cnt++;
274 cd = rv = malloc(sizeof(*rv)*cnt);
275 for (c=c1; c->len; c++)
276 *cd++ = *c;
277 for (c=c2; c->len; c++)
278 *cd++ = *c;
279 cd->len = 0;
280 free(c1);
281 free(c2);
282 return rv;
285 #if 0
286 static void printword(struct elmnt e)
288 if (e.start[0])
289 printf("%.*s", e.len, e.start);
290 else {
291 int a,b,c;
292 sscanf(e.start+1, "%d %d %d", &a, &b, &c);
293 printf("*** %d,%d **** %d\n", b,c,a);
296 #endif
299 * reduce a file by discarding less interesting words
300 * Words that end with a newline are interesting (so all words
301 * in line-mode are interesting) and words that start with
302 * and alphanumeric are interesting. This excludes spaces and
303 * special characters in word mode
304 * Doing a best-fit comparision on only interesting words is
305 * much fast than on all words, and it nearly as good
308 static inline int is_skipped(struct elmnt e)
310 return !( ends_line(e) ||
311 isalnum(e.start[0]) ||
312 e.start[0] == '_');
314 struct file reduce(struct file orig)
316 int cnt=0;
317 int i;
318 struct file rv;
320 for (i=0; i<orig.elcnt; i++)
321 if (!is_skipped(orig.list[i]))
322 cnt++;
324 if (cnt == orig.elcnt) return orig;
326 rv.elcnt = cnt;
327 rv.list = malloc(cnt*sizeof(struct elmnt));
328 cnt = 0;
329 for (i=0; i<orig.elcnt; i++)
330 if (!is_skipped(orig.list[i]))
331 rv.list[cnt++] = orig.list[i];
332 return rv;
335 /* Given a list of best matches between a1 and b1 which are
336 * subsets of a2 and b2, convert that list to indexes into a2/b2
338 * When we find the location in a2/b2, we expand to include all
339 * immediately surrounding words which were skipped
341 void remap(struct best *best, int cnt,
342 struct file a1, struct file b1,
343 struct file a2, struct file b2)
345 int b;
346 int pa,pb;
347 pa=pb=0;
349 if (a1.elcnt == 0 && a2.elcnt == 0) return;
351 for (b=1; b<cnt; b++)
352 if (best[b].val>0) {
353 #if 0
354 printf("best %d,%d %d,%d\n",
355 best[b].xlo,best[b].ylo,
356 best[b].xhi,best[b].yhi);
357 #endif
358 while (pa<a2.elcnt &&
359 a2.list[pa].start != a1.list[best[b].xlo].start)
360 pa++;
361 if (pa == a2.elcnt) abort();
362 while (pb<b2.elcnt &&
363 b2.list[pb].start != b1.list[best[b].ylo].start)
364 pb++;
365 if (pb == b2.elcnt) abort();
367 /* pa,pb is the start of this best bit. Step
368 * backward over ignored words
370 while (pa>0 && is_skipped(a2.list[pa-1]))
371 pa--;
372 while (pb>0 && is_skipped(b2.list[pb-1]))
373 pb--;
375 #if 0
376 printf("-> %d,%d\n", pa,pb);
377 #endif
378 best[b].xlo = pa;
379 best[b].ylo = pb;
381 while (pa<a2.elcnt &&
382 a2.list[pa-1].start != a1.list[best[b].xhi-1].start)
383 pa++;
384 if (pa == a2.elcnt && best[b].xhi != a1.elcnt) abort();
385 while (pb<b2.elcnt &&
386 b2.list[pb-1].start != b1.list[best[b].yhi-1].start)
387 pb++;
388 if (pb == b2.elcnt && best[b].yhi != b1.elcnt) abort();
390 /* now step pa,pb forward over ignored words */
391 while (pa<a2.elcnt && is_skipped(a2.list[pa]))
392 pa++;
393 while (pb<b2.elcnt && is_skipped(b2.list[pb]))
394 pb++;
395 #if 0
396 printf("-> %d,%d\n", pa,pb);
397 #endif
398 best[b].xhi = pa;
399 best[b].yhi = pb;
403 static void find_best_inorder(struct file *a, struct file *b,
404 int alo, int ahi, int blo, int bhi,
405 struct best *best, int bestlo, int besthi)
407 /* make sure the best matches we find are inorder.
408 * If they aren't we find a overall best, and
409 * recurse either side of that
411 int i;
412 int bad=0;
413 int bestval, bestpos=0;
414 for (i=bestlo; i<besthi; i++) best[i].val = 0;
415 find_best(a,b,alo,ahi,blo,bhi,best);
416 for (i=bestlo+1; i<besthi; i++)
417 if (best[i-1].val > 0 &&
418 best[i].val > 0 &&
419 best[i-1].xhi >= best[i].xlo)
420 bad = 1;
422 if (!bad)
423 return;
424 bestval = 0;
425 for (i=bestlo; i<besthi; i++)
426 if (best[i].val > bestval) {
427 bestval = best[i].val;
428 bestpos = i;
430 if (bestpos > bestlo) {
431 /* move top down below chunk marker */
432 int y = best[bestpos].ylo;
433 while (b->list[y].start[0]) y--;
434 find_best_inorder(a,b,
435 alo, best[bestpos].xlo,
436 blo, y,
437 best, bestlo, bestpos);
439 if (bestpos < besthi-1) {
440 /* move bottom up to chunk marker */
441 int y = best[bestpos].yhi;
442 while (b->list[y].start[0]) y++;
443 find_best_inorder(a,b,
444 best[bestpos].xhi, ahi,
445 y, bhi,
446 best, bestpos+1, besthi);
450 struct csl *pdiff(struct file a, struct file b, int chunks)
452 int alo,ahi,blo,bhi;
453 struct csl *csl1, *csl2;
454 struct best *best = malloc(sizeof(struct best)*(chunks+1));
455 int i;
456 struct file asmall, bsmall;
458 asmall = reduce(a);
459 bsmall = reduce(b);
461 alo = blo = 0;
462 ahi = asmall.elcnt;
463 bhi = bsmall.elcnt;
464 /* printf("start: %d,%d %d,%d\n", alo,blo,ahi,bhi); */
466 for (i=0; i<chunks+1; i++)
467 best[i].val = 0;
468 find_best_inorder(&asmall,&bsmall,
469 0, asmall.elcnt, 0, bsmall.elcnt,
470 best, 1, chunks+1);
471 #if 0
472 /* for(i=0; i<b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
473 for (i=1; i<=chunks; i++) {
474 printf("end: %d,%d %d,%d\n", best[i].xlo,best[i].ylo,best[i].xhi,best[i].yhi);
475 printf("<"); printword(bsmall.list[best[i].ylo]); printf("><");
476 printword(bsmall.list[best[i].yhi-1]);printf(">\n");
478 #endif
479 remap(best,chunks+1,asmall,bsmall,a,b);
480 #if 0
481 /* for(i=0; i<b.elcnt;i++) { printf("%d: ", i); printword(b.list[i]); }*/
482 for (i=1; i<=chunks; i++)
483 printf("end: %d,%d %d,%d\n", best[i].xlo,best[i].ylo,best[i].xhi,best[i].yhi);
484 printf("small: a %d b %d --- normal: a %d b %d\n", asmall.elcnt, bsmall.elcnt, a.elcnt, b.elcnt);
485 #endif
487 csl1 = NULL;
488 for (i=1; i<=chunks; i++)
489 if (best[i].val>0) {
490 #if 0
491 int j;
492 printf("Before:\n");
493 for (j=best[i].xlo; j<best[i].xhi; j++)
494 printword(a.list[j]);
495 printf("After:\n");
496 for (j=best[i].ylo; j<best[i].yhi; j++)
497 printword(b.list[j]);
498 #endif
499 csl2 = diff_partial(a,b,
500 best[i].xlo,best[i].xhi,
501 best[i].ylo,best[i].yhi);
502 csl1 = csl_join(csl1,csl2);
504 if (csl1) {
505 for (csl2=csl1; csl2->len; csl2++);
506 csl2->a = a.elcnt;
507 csl2->b = b.elcnt;
508 } else {
509 csl1 = malloc(sizeof(*csl1));
510 csl1->len = 0;
511 csl1->a = a.elcnt;
512 csl1->b = b.elcnt;
514 free(best);
515 return csl1;