Reputation: 6218
I have an issue I could use some help with. I have a Python list that looks like this:
fail = [
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt'],
]
Each entry is: sha1 value, directory, filename.
What I want is to separate this content into two different lists based on the sha1 value and directory. For example:
['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt']
['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt']
These I want to add to the list duplicate = [], because they are in the same directory with the same sha1 value (and only that directory). The rest of the entries I want to add to another list, say diff = [], because the sha1 value is the same but the directories differ.
I'm kind of lost with the logic here, so any help I could get would be appreciated!
EDIT: Fixed a typo: the last value (filename) was in some cases a one-element list, which was 100% incorrect. Thanks to SilentGhost for making me aware of this issue.
Upvotes: 1
Views: 1065
Reputation: 5807
You could simply loop through all the values, then use an inner loop to compare directories; if the directory is the same, compare the sha1 values and assign to the lists. This gives you a decent O(n^2) algorithm to sort it out.
Maybe like this (untested) code:
duplicate = []
for i in range(len(fail) - 1):
    sha1, path = fail[i][0], fail[i][1]
    for j in range(i + 1, len(fail)):
        # == compares strings by value, so it works for both fields
        if path == fail[j][1] and sha1 == fail[j][0]:
            # same sha1 in the same directory: both rows are duplicates
            if fail[i] not in duplicate:
                duplicate.append(fail[i])
            if fail[j] not in duplicate:
                duplicate.append(fail[j])
diff = [row for row in fail if row not in duplicate]
Again, the code is untested.
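If the quadratic scan ever becomes a problem, the same grouping can be done in O(n log n) by sorting on the (sha1, directory) pair and walking the groups. A minimal sketch using itertools.groupby (my own addition, not part of the answer above):
from itertools import groupby

keyfunc = lambda row: (row[0], row[1])  # group on (sha1, directory)
duplicate, diff = [], []
for _, group in groupby(sorted(fail, key=keyfunc), key=keyfunc):
    rows = list(group)
    # more than one file with the same sha1 in the same directory
    (duplicate if len(rows) > 1 else diff).extend(rows)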
Upvotes: 1
Reputation: 56572
I believe the accepted answer will be slightly more efficient (Python's internal sort should be faster than my dictionary walk), but since I already came up with this, I may as well post it. :-)
This technique uses a multilevel dictionary to avoid both sorting and explicit comparisons.
hashes = {}
diff = []
dupe = []

# build the dictionary: sha -> {path: [filenames]}
for sha, path, files in fail:
    try:
        hashes[sha][path].append(files)
    except KeyError:
        try:
            hashes[sha][path] = [files]
        except KeyError:
            hashes[sha] = {path: [files]}
for sha, paths in hashes.iteritems():
    # same sha1 under more than one directory: all of its files go to diff
    if len(paths) > 1:
        for path, files in paths.iteritems():
            for fname in files:
                diff.append([sha, path, fname])
    # same sha1 and directory holding more than one file: these are dupes
    for path, files in paths.iteritems():
        if len(files) > 1:
            for fname in files:
                dupe.append([sha, path, fname])
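As an aside, the nested try/except bookkeeping can be collapsed with collections.defaultdict; a minimal sketch of the same build step (my variation, not the original answer's code):
from collections import defaultdict

hashes = defaultdict(lambda: defaultdict(list))
for sha, path, fname in fail:
    # missing sha/path levels are created automatically on first access
    hashes[sha][path].append(fname)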
The result will be:
diff = [
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt']
]
dupe = [
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt']
]
Upvotes: 0
Reputation: 8966
Here's another way to go at it, using dictionaries to group by sha and directory. This also gets rid of the stray lists in the file names.
new_fail = {}  # {sha: {dir: [filenames]}}

for item in fail:
    # split the row into its parts
    sha, directory, filename = item
    # make sure the correct levels exist in the data structure
    if sha not in new_fail:
        new_fail[sha] = {}
    if directory not in new_fail[sha]:
        new_fail[sha][directory] = []
    # this is where the stray lists are removed from the file names
    if isinstance(filename, list):
        filename = filename[0]
    new_fail[sha][directory].append(filename)
diff = []
dup = []

# loop through the grouped data, analyzing it
for sha, val in new_fail.iteritems():
    for directory, filenames in val.iteritems():
        # check to see if the sha/dir combo has more than one file name
        if len(filenames) > 1:
            for filename in filenames:
                dup.append([sha, directory, filename])
        else:
            diff.append([sha, directory, filenames[0]])
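As an aside, the two membership checks in the build loop can be collapsed with dict.setdefault, which inserts and returns the missing levels in one expression; a small equivalent sketch (my variation, not the original answer's code):
# setdefault returns the existing value, or inserts and returns the default
new_fail.setdefault(sha, {}).setdefault(directory, []).append(filename)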
To print it:
print 'diff:'
for i in diff:
    print i

print '\ndup:'
for i in dup:
    print i
Sample output looks like this:
diff:
['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py']
['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java']
['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt']
['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt']
['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt']

dup:
['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt']
['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt']
Upvotes: 0
Reputation: 667
In the following code sample, I use a key based on the SHA1 and directory name to detect unique and duplicate entries and spare dictionaries for housekeeping.
# Test dataset
fail = [
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt'],
]
def sort_duplicates(filelist):
    """Returns a tuple whose first element is a list of unique files,
    and second element is a list of duplicate files.
    """
    diff = []
    diff_d = {}
    duplicate = []
    duplicate_d = {}
    for entry in filelist:
        # Make an immutable key based on the SHA-1 and directory strings
        key = (entry[0], entry[1])
        # If this entry is a known duplicate, add it to the duplicate list
        if key in duplicate_d:
            duplicate.append(entry)
        # If this entry is a new duplicate, add it to the duplicate list
        elif key in diff_d:
            duplicate.append(entry)
            duplicate_d[key] = entry
            # And relocate the matching entry to the duplicate list
            matching_entry = diff_d[key]
            duplicate.append(matching_entry)
            duplicate_d[key] = matching_entry
            del diff_d[key]
            diff.remove(matching_entry)
        # Otherwise add this entry to the different list
        else:
            diff.append(entry)
            diff_d[key] = entry
    return (diff, duplicate)
def test():
    diff, dups = sort_duplicates(fail)
    print "Diff:", diff
    print "Dups:", dups

test()
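For reference, run against the test dataset above this should print something like the following (assuming the iteration order shown):
Diff: [['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java'], ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt'], ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py'], ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt'], ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt']]
Dups: [['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt'], ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt']]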
Upvotes: 1
Reputation: 902
duplicate = []

# Sort the list so we can compare adjacent values
fail.sort()
# If you didn't want to modify the list in place you could use:
#     sortedFail = sorted(fail)
# and then use sortedFail in the rest of the code instead of fail.

for i, x in enumerate(fail):
    if i + 1 == len(fail):
        # end of the list
        break
    # adjacent rows with the same (sha1, directory) prefix are duplicates
    if x[:2] == fail[i + 1][:2]:
        if x not in duplicate:
            duplicate.append(x)
        if fail[i + 1] not in duplicate:
            duplicate.append(fail[i + 1])

# diff is just anything not in duplicate, as far as I can tell from the explanation
diff = [d for d in fail if d not in duplicate]
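One caveat: membership tests on a list are linear scans, so the `not in duplicate` checks make this quadratic in the worst case. A sketch of the same idea using a hypothetical `seen` set for O(1) lookups (my addition, not part of the original answer):
seen = set()
duplicate = []
fail.sort()
for i in range(len(fail) - 1):
    # adjacent rows sharing (sha1, directory) are duplicates
    if fail[i][:2] == fail[i + 1][:2]:
        for row in (fail[i], fail[i + 1]):
            if tuple(row) not in seen:
                seen.add(tuple(row))
                duplicate.append(row)
diff = [row for row in fail if tuple(row) not in seen]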
With your example input:
duplicate: [
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'apa.txt'],
    ['95d1543adea47e88923c3d4ad56e9f65c2b40c76', 'ron\\c', 'knark.txt']
]
diff: [
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\a', 'apa2.txt'],
    ['b5cc17d3a35877ca8b76f0b2e07497039c250696', 'ron\\c', 'apa1.txt'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\a\\include', 'svin.txt'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\b\\include', 'Test.java'],
    ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ron\\d', 'Sourcecheck.py']
]
So perhaps I missed something, but I think this is what you were asking for.
Upvotes: 3