Situation: I have two computers, each holding a very large number of files (approximately 250 million per machine). I need to bring them into sync, and rsync is not an option because it takes far too long. Instead, I need to ‘diff’ the file lists of the two machines. I ran ‘find’ on each machine and saved the output to a file. Each of these lists turned out to be about 15GB, which is too large to simply ‘diff’, because diff tries to read both files into memory:
[root@fs105 tmp]# diff imagelist_image01.txt imagelist.txt
diff: memory exhausted
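Solution: sort each file list first, then use the ‘comm’ command to find the differences. ‘comm’ compares two files line by line and requires both to be sorted in the same order; with -3 it suppresses the lines common to both files, leaving only the lines unique to one file or the other. ‘sort’ can handle files larger than memory because it spills to temporary files: -S 2G sets the size of its in-memory buffer and -T . places the temporary files in the current directory.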
[root@fs105 tmp]# ls -la
total 27935834
drwxr-xr-x 2 root root 120 Oct 6 10:18 .
drwxr-xr-x 8 root root 192 Oct 3 13:56 ..
-rw-r--r-- 1 root root 13859131915 Oct 6 10:13 imagelist_image01.txt
-rw-r--r-- 1 root root 14719246513 Oct 3 14:02 imagelist.txt
[root@fs105 tmp]# sort -S 2G -T . imagelist.txt > imagelist_image02_sorted.txt ; sort -S 2G -T . imagelist_image01.txt > imagelist_image01_sorted.txt
[root@fs105 tmp]# comm -3 imagelist_image01_sorted.txt imagelist_image02_sorted.txt > diff.txt
[root@fs105 tmp]# ls -lah ; wc -l diff.txt
total 55G
drwxr-xr-x 2 root root 240 Oct 6 14:16 .
drwxr-xr-x 8 root root 192 Oct 3 13:56 ..
-rw-r--r-- 1 root root 895M Oct 6 15:09 diff.txt
-rw-r--r-- 1 root root 13G Oct 6 14:11 imagelist_image01_sorted.txt
-rw-r--r-- 1 root root 13G Oct 6 10:13 imagelist_image01.txt
-rw-r--r-- 1 root root 14G Oct 6 12:25 imagelist_image02_sorted.txt
-rw-r--r-- 1 root root 14G Oct 3 14:02 imagelist.txt
17487092 diff.txt
