summaryrefslogtreecommitdiffstats
path: root/examples/folder_tree.py
blob: 695d6efa329cbc325b792d50f33ece741149f161 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
# A file folder scanner contributed by @holger
#
# You can specify the folder to scan and the file name pattern with the
# abspath and pattern command-line arguments
#

__author__ = "holger"

from treelib import tree

import fnmatch
import os
import zlib
import argparse

# Set to a truthy value to print verbose tracing (CRC inputs, created nodes).
DEBUG = 0
# Running totals of the file and directory nodes that crawler() adds to the tree.
FILECOUNT = 0
DIRCOUNT = 0
# Paths collected by crawler() when a computed parent id equals str(None);
# both lists are printed after the scan.
DIR_ERRORLIST = []
FILE_ERRORLIST = []


# Time Profiling
PROFILING = 0
# 0 - nothing: crawler() is called directly
# 1 - time: a single crawler() run is timed with timeit
# 2 - cProfile: crawler() is run under cProfile

# The profiling modules are only imported for the mode that needs them.
if PROFILING == 1:
    import timeit
if PROFILING == 2:
    import cProfile


# Command-line interface: two required positional arguments, the folder to
# scan and the file name pattern.
parser = argparse.ArgumentParser(
    description="Scan the given folder and print its structure in a tree."
)
parser.add_argument("abspath", type=str, help="An absolute path to be scanned.")
parser.add_argument(
    "pattern", type=str, help="File name pattern to filtered, e.g. *.pdf"
)

# Parsing happens at import time; the two values are read as globals by the
# functions below.
args = parser.parse_args()
rootPath = args.abspath
pattern = args.pattern

# Folder names consulted by crawler() before file nodes are added; empty by
# default, so nothing is skipped.
folder_blacklist = []

# The tree starts with a single node tagged "Root" that uses the scanned path
# itself as its identifier (see treelib's Tree.create_node).
dir_tree = tree.Tree()
dir_tree.create_node("Root", rootPath)  # root node


def crc32(data):
    """Return the CRC-32 checksum of a string as a hexadecimal string.

    - data contains the text to hash; it is encoded as UTF-8 before hashing

    The result is formatted with hex(), so it carries a '0x' prefix.
    When DEBUG is set, the encoded input and the checksum are printed too.
    """
    encoded = bytes(data, "UTF-8")
    # Masking to 32 bits keeps the value unsigned; zlib.crc32 could return a
    # signed value on older Pythons, and this matches the py3k result.
    checksum = hex(zlib.crc32(encoded) & 0xFFFFFFFF)

    if DEBUG:
        print("++++++ CRC32 ++++++")
        print("input: " + str(encoded))
        print("crc32: " + checksum)
        print("+++++++++++++++++++")
    return checksum


# NOTE(review): parent and i are not referenced anywhere else in the visible
# file; they are kept so the module-level names stay available.
parent = rootPath
i = 1

# calculating start depth
# start_depth is the number of "/" separators in the path of a direct child
# of rootPath. crawler() subtracts it from the separator count of every entry,
# so direct children get depth 0 and get_parentid() attaches them to the root
# node. Joining with "" appends the trailing separator only when it is missing,
# which makes "/data/mp3" and "/data/mp3/" produce the same start depth.
# (Counting the separators of rootPath itself made the result depend on
# whether the user typed a trailing slash.)
start_depth = os.path.join(rootPath, "").count("/")


def get_noteid(depth, root, dir):
    """Build the tree node identifier for a folder or file.

    - depth contains the current depth of the folder hierarchy
    - root contains the path of the folder that holds the entry
    - dir contains the name of the entry itself

    The identifier is a string of the form

    <depth>_<name>+++<crc32>
    e.g. 2_Folder_XYZ_1+++<crc32>

    All spaces in the '<depth>_<name>' part are replaced by '_'. The <crc32>
    part is the hash of root joined with dir, which keeps entries with the
    same name and depth in different folders apart.
    """
    label = (str(depth) + "_" + dir).replace(" ", "_")
    path_hash = crc32(os.path.join(root, dir))
    return label + "+++" + path_hash


# TODO: check directory depth: what about something like /mp3/


def get_parentid(current_depth, root, dir):
    """Build the identifier of the parent node of a folder or file.

    - current_depth contains the depth of the entry itself
    - root contains the path of the folder that holds the entry
    - dir contains the name of the entry

    For entries at depth 0 the parent is the root of the tree, so root is
    returned unchanged; the tree's root node uses the plain path as its
    identifier instead of a cryptic generated one.

    For every other entry the result has the same layout that get_noteid
    produces for the containing folder:

    <current_depth - 1>_<parent folder name>+++<crc32 of root>
    """
    if current_depth == 0:
        return root

    # The parent folder name sits between the last two '/' of the full path,
    # e.g. /home/user1/mp3/folder1/parent_folder/current_folder
    # yields 'parent_folder'.
    full_path = os.path.join(root, dir)
    last_sep = full_path.rfind("/")
    prev_sep = full_path.rfind("/", 0, last_sep)
    parent_name = full_path[prev_sep + 1 : last_sep]  # noqa: E203

    prefix = str(current_depth - 1) + "_" + parent_name.replace(" ", "_")
    return prefix + "+++" + crc32(root)
    # TODO: catch error


def print_node(dir, node_id, parent_id):
    """Print a short debug report for a newly created tree node.

    - dir contains the name of the folder or file the node stands for
    - node_id contains the identifier of the new node
    - parent_id contains the identifier of its parent node

    All three arguments are concatenated to a label, so they must be strings.
    """
    details = (
        ("      dir:     ", dir),
        ("      note_id: ", node_id),
        ("      parent:  ", parent_id),
    )
    print("#############################")
    print("node created")
    for label, value in details:
        print(label + value)


def _add_node(root, name, error_list):
    """Create the tree node for the entry `name` found in the folder `root`.

    - root contains the path of the folder currently visited by os.walk
    - name contains the folder or file name inside root
    - error_list receives the full path when the parent id is str(None)
    """
    full_path = os.path.join(root, name)
    # Depth relative to the scanned folder, measured in '/' separators.
    current_depth = full_path.count("/") - start_depth

    if DEBUG:
        print("current: " + full_path)

    node_id = get_noteid(current_depth, root, name)
    parent_id = str(get_parentid(current_depth, root, name))

    if parent_id == str(None):
        error_list.append(full_path)

    if DEBUG:
        print_node(name, node_id, parent_id)

    # create node
    dir_tree.create_node(name, node_id, parent_id)


def crawler():
    """Walk rootPath and add every folder and every matching file to dir_tree.

    For each folder visited by os.walk, its sub-folders are added first and
    then the files whose names match the global `pattern`. Node and parent
    identifiers come from get_noteid() and get_parentid(). DIRCOUNT and
    FILECOUNT are incremented once per created node.

    Files are skipped when the name of the folder that contains them is listed
    in folder_blacklist.
    """
    global DIRCOUNT
    global FILECOUNT

    for root, dirs, files in os.walk(rootPath):
        # +++ DIRECTORIES +++
        for dirname in dirs:
            _add_node(root, dirname, DIR_ERRORLIST)
            DIRCOUNT += 1

        # +++ FILES +++
        # The blacklist is checked against the folder that holds the files.
        # The previous code read the loop variable left over from the
        # directory loop, which is unbound when the first visited folder has
        # no sub-folders and otherwise names an unrelated folder.
        # NOTE(review): sub-folders of a blacklisted folder are still scanned;
        # confirm whether they should be pruned from `dirs` as well.
        if os.path.basename(root) in folder_blacklist:
            continue

        for filename in fnmatch.filter(files, pattern):
            # Errors are recorded under the file's own path.
            _add_node(root, filename, FILE_ERRORLIST)
            FILECOUNT += 1


# Run the scan exactly once, in the mode selected by PROFILING.
if PROFILING == 1:
    t1 = timeit.Timer("crawler()", "from __main__ import crawler")
    print("time:      " + str(t1.timeit(number=1)))
elif PROFILING == 2:
    cProfile.run("crawler()")
elif PROFILING == 0:
    crawler()


# Totals collected by crawler().
for label, count in (("filecount: ", FILECOUNT), ("dircount:  ", DIRCOUNT)):
    print(label + str(count))

# Directory entries whose parent could not be determined, if any.
if not DIR_ERRORLIST:
    print("no directory errors")
for item in DIR_ERRORLIST:
    print(item)

print("\n\n\n")

# File entries whose parent could not be determined, if any.
if not FILE_ERRORLIST:
    print("no file errors")
for item in FILE_ERRORLIST:
    print(item)

node_total = len(dir_tree.nodes)
print("nodes: " + str(node_total))

# Finally print the collected folder structure.
dir_tree.show()