Still more updates to 'p'
[wiggle.git] / diff.c
blobc48340a66791575fc8a4225539ca93467aaea251
1 /*
2 * wiggle - apply rejected patches
4 * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * Author: Neil Brown
22 * Email: <neilb@cse.unsw.edu.au>
23 * Paper: Neil Brown
24 * School of Computer Science and Engineering
25 * The University of New South Wales
26 * Sydney, 2052
27 * Australia
31 * calculate longest common sequence between two sequences
33 * Each sequence contains strings with
34 * hash start length
35 * We produce a list of tripples: a b len
36 * where A and B point to elements in the two sequences, and len is the number
37 * of common elements there
39 * To help keep matches close together (important when matching a changed fragment
40 * against a whole) we track the disagonal of the first and last match on any path.
41 * When choosing the best of two paths, we choose the furthest reaching unless
42 * the other has a match and it's absolute diagonal difference is significantly smaller.
43 * 'Significant' if the reduction in difference exceeds the loss of progress by a
44 * factor of 2.
48 #include <malloc.h>
49 #include "wiggle.h"
50 #include <string.h>
51 #include <stdlib.h>
54 struct v {
55 int x; /* x location of furthest reaching path of current cost */
56 int md; /* diagonal location of midline crossing */
57 int l; /* number of continuous common sequences found so far */
61 static int find_common(struct file *a, struct file *b,
62 int *alop, int *ahip,
63 int *blop, int *bhip,
64 int mid,
65 struct v *v)
67 /* examine matrix from alo to ahi and blo to bhi
68 * finding longest subsequence.
69 * return new {a,b}{lo,hi} either side of midline.
70 * i.e. alo+blo <= mid <= ahi+bhi
71 * and alo,blo to ahi,bhi is a common (possibly empty) subseq
73 * v is scratch space each is indexable from
74 * alo-bhi to ahi-blo inclusive
77 int klo, khi;
78 int k;
79 int alo = *alop;
80 int ahi = *ahip;
81 int blo = *blop;
82 int bhi = *bhip;
83 int x,y;
85 int best = (ahi-alo)+(bhi-blo);
86 int dist;
88 klo = khi = alo-blo;
89 v[klo].x = alo;
90 v[klo].l = 0;
92 while(1) {
94 for (k=klo ; k <= khi ; k+= 2) {
95 int snake = 0;
96 struct v vnew = v[k];
97 x = v[k].x;
98 y = x-k;
99 if (y > bhi) abort();
100 while (x < ahi && y < bhi &&
101 match(&a->list[x], &b->list[y])
103 x++;
104 y++;
105 snake=1;
107 vnew.x = x;
108 vnew.l += snake;
109 dist = (ahi-x)+(bhi-y);
110 if (dist < best) best = dist;
111 if (x+y >= mid &&
112 v[k].x+v[k].x-k <= mid) {
113 vnew.md = k;
115 v[k] = vnew;
117 if (dist == 0) {
118 /* OK! We have arrived.
119 * We crossed the midpoint at or after v[k].xm,v[k].ym
121 if (x != ahi) abort();
122 x = (v[k].md+mid)/2;
123 y = x-v[k].md;
124 *alop = x;
125 *blop = y;
127 while (x < ahi && y < bhi &&
128 match(&a->list[x], &b->list[y])
130 x++;
131 y++;
134 *ahip = x;
135 *bhip = y;
137 return k;
141 for (k=klo+1; k <= khi-1 ; k+= 2) {
142 if (v[k-1].x+1 >= v[k+1].x ) {
143 v[k] = v[k-1];
144 v[k].x++;
145 } else {
146 v[k] = v[k+1];
150 x = v[klo].x; y = x -(klo-1);
151 dist = abs((ahi-x)-(bhi-y));
152 if (dist <= best) {
153 v[klo-1] = v[klo];
154 klo --;
155 } else
156 while (dist > best) {
157 klo ++;
158 x = v[klo].x; y = x -(klo-1);
159 dist = abs((ahi-x)-(bhi-y));
162 x = v[khi].x+1; y = x - (khi+1);
163 dist = abs((ahi-x)-(bhi-y));
164 if (dist <= best) {
165 v[khi+1] = v[khi];
166 v[khi+1].x++;
167 khi ++;
168 } else
169 while (dist > best) {
170 khi --;
171 x = v[khi].x+1; y = x - (khi+1);
172 dist = abs((ahi-x)-(bhi-y));
177 static struct csl *lcsl(struct file *a, int alo, int ahi,
178 struct file *b, int blo, int bhi,
179 struct csl *csl,
180 struct v *v)
182 int len;
183 int alo1 = alo;
184 int ahi1 = ahi;
185 int blo1 = blo;
186 int bhi1 = bhi;
187 struct csl *rv = NULL;
188 int k;
190 if (ahi <= alo || bhi <= blo)
191 return csl;
194 k = find_common(a,b,
195 &alo1, &ahi1,
196 &blo1, &bhi1,
197 (ahi+bhi+alo+blo)/2,
199 if (k != ahi-bhi) abort();
201 len = v[k].l;
203 if (csl == NULL) {
204 rv = csl = malloc((len+1)*sizeof(*csl));
205 csl->len = 0;
207 if (len) {
208 csl = lcsl(a,alo,alo1,
209 b,blo,blo1,
210 csl, v);
212 if (ahi1 > alo1) {
213 /* need to add this common seq, possibly attach
214 * to last
216 if (csl->len &&
217 csl->a+csl->len == alo1 &&
218 csl->b+csl->len == blo1) {
219 csl->len += ahi1-alo1;
220 } else {
221 if (csl->len) csl++;
222 csl->len = ahi1-alo1;
223 csl->a = alo1;
224 csl->b = blo1;
225 csl[1].len = 0;
228 csl = lcsl(a,ahi1,ahi,
229 b,bhi1,bhi,
230 csl,v);
232 if (rv) {
233 if (csl->len)
234 csl++;
235 csl->a = ahi;
236 csl->b = bhi;
237 #if 1
238 if (rv+len != csl || csl->len != 0)
239 abort(); /* number of runs was wrong */
240 #endif
241 return rv;
242 } else
243 return csl;
246 /* if two common sequences are separated by only an add or remove,
247 * and the first common ends the same as the middle text,
248 * extend the second and contract the first in the hope that the
249 * first might become empty. This ameliorates against the greedyness
250 * of the 'diff' algorithm.
251 * Once this is done, repeat the process but extend the first
252 * in favour of the second. The acknowledges that semantic units
253 * more often end with common text ("return 0;\n}\n", "\n") than
254 * start with it.
256 static void fixup(struct file *a, struct file *b, struct csl *list)
258 struct csl *list1, *orig;
259 int lasteol = -1;
260 if (!list) return;
261 orig = list;
262 list1 = list+1;
263 while (list->len && list1->len) {
264 if ((list->a+list->len == list1->a &&
265 /* text at b inserted */
266 match(&b->list[list->b+list->len-1],
267 &b->list[list1->b-1])
270 (list->b+list->len == list1->b &&
271 /* text at a deleted */
272 match(&a->list[list->a+list->len-1],
273 &a->list[list1->a-1])
276 /* printword(a->list[list1->a-1]);
277 printf("fixup %d,%d %d : %d,%d %d\n",
278 list->a,list->b,list->len,
279 list1->a,list1->b,list1->len);
280 */ if (ends_line(a->list[list->a+list->len-1])
281 && a->list[list->a+list->len-1].len==1
282 && lasteol == -1
284 /* printf("E\n");*/
285 lasteol = list1->a-1;
287 list1->a--;
288 list1->b--;
289 list1->len++;
290 list->len--;
291 if (list->len == 0) {
292 lasteol = -1;
293 if (list > orig)
294 list--;
295 else {
296 *list = *list1++;
297 /* printf("C\n");*/
300 } else {
301 if (lasteol >= 0) {
302 /* printf("seek %d\n", lasteol);*/
303 while (list1->a <= lasteol && list1->len>1) {
304 list1->a++;
305 list1->b++;
306 list1->len--;
307 list->len++;
309 lasteol=-1;
311 *++list = *list1++;
314 list[1] = list1[0];
317 struct csl *diff(struct file a, struct file b)
319 struct v *v;
320 struct csl *csl;
321 v = malloc(sizeof(struct v)*(a.elcnt+b.elcnt+2));
322 v += b.elcnt+1;
324 csl = lcsl(&a, 0, a.elcnt,
325 &b, 0, b.elcnt,
326 NULL, v);
327 free(v-(b.elcnt+1));
328 fixup(&a, &b, csl);
329 if (!csl) {
330 csl = malloc(sizeof(*csl));
331 csl->len = 0;
332 csl->a = a.elcnt;
333 csl->b = b.elcnt;
335 return csl;
338 struct csl *diff_partial(struct file a, struct file b,
339 int alo, int ahi, int blo, int bhi)
341 struct v *v;
342 struct csl *csl;
343 v = malloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
344 v += bhi-alo+1;
346 csl = lcsl(&a, alo, ahi,
347 &b, blo, bhi,
348 NULL, v);
349 free(v-(bhi-alo+1));
350 fixup(&a, &b, csl);
351 return csl;
355 #ifdef MAIN
357 main(int argc, char *argv[])
359 struct file a, b;
360 struct csl *csl;
361 struct elmnt *lst = malloc(argc*sizeof(*lst));
362 int arg;
363 int alo, ahi, blo, bhi;
364 struct v *v;
365 int ln;
367 arg = 1;
368 a.elcnt = 0;
369 a.list = lst;
370 while (argv[arg] && strcmp(argv[arg],"--")) {
371 lst->hash = 0;
372 lst->start = argv[arg];
373 lst->len = strlen(argv[arg]);
374 a.elcnt++;
375 lst++;
376 arg++;
378 if (!argv[arg]) {
379 printf("AARGH\n");
380 exit(1);
382 arg++;
383 b.elcnt = 0;
384 b.list = lst;
385 while (argv[arg] && strcmp(argv[arg],"--")) {
386 lst->hash = 0;
387 lst->start = argv[arg];
388 lst->len = strlen(argv[arg]);
389 b.elcnt++;
390 lst++;
391 arg++;
394 v = malloc(sizeof(struct v)*(a.elcnt+b.elcnt+2));
395 v += b.elcnt+1;
396 alo = blo = 0;
397 ahi = a.elcnt;
398 bhi = b.elcnt;
399 #if 0
400 ln = find_common(&a, &b,
401 &alo, &ahi, &blo, &bhi,
402 (ahi+bhi)/2,
405 printf("ln=%d (%d,%d) -> (%d,%d)\n", ln,
406 alo,blo,ahi,bhi);
407 #else
408 csl = lcsl(&a, 0, a.elcnt,
409 &b, 0, b.elcnt,
410 NULL, v);
411 fixup(&a, &b, csl);
412 while (csl && csl->len) {
413 int i;
414 printf("%d,%d for %d:\n", csl->a,csl->b,csl->len);
415 for (i=0; i<csl->len; i++) {
416 printf(" %.*s (%.*s)\n",
417 a.list[csl->a+i].len, a.list[csl->a+i].start,
418 b.list[csl->b+i].len, b.list[csl->b+i].start);
420 csl++;
422 #endif
424 exit(0);
427 #endif