summaryrefslogtreecommitdiff
path: root/utf8_count/utf8_count.lua
blob: a1d0a4e91a21a331ccc2be707e9d1b3751eb8713 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/lua

-- Verbose diagnostics switch; set to true by the "-d" command line flag.
debug_mode = false

-- Tell whether a version string names an interpreter this script can run on.
-- The script relies on the utf8 library and on 64-bit integers, which are
-- available from Lua 5.3 onwards (5.4 provides them as well).
-- version: string in the format of _VERSION, e.g. "Lua 5.3"
-- returns: true for Lua 5.3 and newer 5.x releases, false for anything else
--          (including values that do not look like a version string)
function is_supported_lua_version( version )
	local major, minor = string.match( tostring( version ), "^Lua (%d+)%.(%d+)$" )
	if major == nil then
		return false
	end
	return tonumber( major ) == 5 and tonumber( minor ) >= 3
end

-- Refuse to run on interpreters that lack the required features.
if not is_supported_lua_version( _VERSION ) then
	print "Wrong lua version"
	print( "Supported 5.3 or newer, found " .. tostring( _VERSION ) )
	os.exit(1)
end

-- Global state that the command line scan fills in.

-- Characters intended for the "ignore" option; the filtering code that would
-- use this list is currently commented out in get_file_stat.
list_ignore_chars = " :。,.][《》!?:"

-- Input file names, collected from the arguments that follow "-f".
arg_files = {}

-- Option switches; every one of them starts switched off.
-- flag_input_files is true while arguments are being taken as file names.
flag_input_files, flag_show_stats = false, false
flag_compact_stats, flag_ignore_chars = false, false

-- Leave file-collection mode so that later arguments are no longer stored
-- as input file names.
function reset_flags()
	flag_input_files = false
end

-- Scan the command line. "-f" switches on file-collection mode, in which
-- every argument that is not one of the known flags is appended to arg_files.
-- "-s", "-c", "-i" and "-h" switch that mode off again through reset_flags();
-- "-d" only enables debug output and leaves the mode as it is.
-- Arguments seen outside file-collection mode that are not flags are ignored.
for _, v in ipairs( arg ) do
	if v == "-f" then
		flag_input_files = true
	elseif v == "-s" then
		reset_flags()
		flag_show_stats = true
	elseif v == "-c" then
		reset_flags()
		flag_compact_stats = true
	elseif v == "-d" then
		debug_mode = true
	elseif v == "-i" then
		reset_flags()
		flag_ignore_chars = true
	elseif v == "-h" then
		reset_flags()
		-- arg[0] holds the script name; negative indices hold the interpreter
		-- and its own options, which do not belong in a usage line.
		print( arg[0], " [FLAGS] [FILE] " )
		print( "-f [FILE] ... - list of files to read" )
		print( "-s            - show char stat")
		print( "-c            - show compact char stat")
		print( "-d            - debug mode")
		--print( "-i            - ignore whitespaces,  newlines")
		print( "-h            - just this help")
	elseif flag_input_files == true then
		-- not a flag and we are inside a "-f" list: treat it as a file name
		table.insert( arg_files, v )
	end
end

-- Debug output ("-d"): print the arg_files table value itself, followed by
-- one line per collected entry holding its index and the file name.
if debug_mode then
	print( arg_files )
	for idx, name in pairs( arg_files ) do
		print( idx, name )
	end
end

-- Add the counters of t2 onto t1.
-- Keys that exist only in t2 are copied over; keys present in both tables
-- end up holding the sum of the two values.
-- t1: table that receives the result (it is modified in place)
-- t2: table whose entries are added
-- returns: t1 itself
function merge_tables( t1, t2 )
	for key, amount in pairs( t2 ) do
		local current = t1[key]
		if current ~= nil then
			t1[key] = current + amount
		else
			t1[key] = amount
		end
	end
	return t1
end

-- Count how often every code point occurs in one UTF-8 encoded file.
-- The file is read line by line through lines(), so the line terminators
-- themselves are not part of the counted text.
-- filename: path of the file to read
-- returns: table mapping code point (integer) -> number of occurrences
-- When the file cannot be opened a message is printed and the script exits
-- with status 1. utf8.codes raises an error if a line is not valid UTF-8.
-- NOTE: filtering through flag_ignore_chars / list_ignore_chars is not
-- implemented here; every decoded code point is counted.
function get_file_stat( filename )
	local hier_table = {}
	-- the handle is local so that it does not leak into the global environment
	local utf8_file, err = io.open( filename, "r" )
	if utf8_file == nil then
		-- err already contains the file name and the reason reported by the OS
		print( "Could not open file: " .. tostring( err ) )
		os.exit(1)
	end
	for line in utf8_file:lines() do
		for _, c in utf8.codes( line ) do
			hier_table[c] = ( hier_table[c] or 0 ) + 1
		end
	end
	utf8_file:close()
	return hier_table
end



-- Accumulate the counts of every input file into one global table keyed by
-- code point.
hier_table = {}
for _, path in ipairs( arg_files ) do
	local per_file = get_file_stat( path )
	hier_table = merge_tables( hier_table, per_file )
end

-- Flatten the totals into a list of {ch=<code point>, val=<count>} records
-- so that they can be ordered afterwards.
hier_table_sorted = {}
for code, count in pairs( hier_table ) do
	hier_table_sorted[#hier_table_sorted + 1] = { ch = code, val = count }
end

-- Ordering for the statistics list: higher counts come first. Records with
-- the same count are ordered by ascending code point, so the result does not
-- depend on table traversal order or on the unstable sort.
-- a, b: records of the form {ch=<code point>, val=<count>}
-- returns: true when a has to be placed before b, false otherwise (also when
--          one of the counts is missing)
function cmpa( a,b )
	if a.val == nil or b.val == nil then
		return false
	end
	if a.val ~= b.val then
		return a.val > b.val
	end
	-- equal counts: fall back to the code point when both records carry one
	if a.ch ~= nil and b.ch ~= nil then
		return a.ch < b.ch
	end
	return false
end
-- Order the records with cmpa before any report is produced.
table.sort( hier_table_sorted, cmpa )

-- "-s": print one line per code point in sorted order, consisting of the
-- character, "=" and its count (print separates the three with tabs).
-- ipairs is used because the order of the list matters here and pairs()
-- does not guarantee any traversal order.
if ( flag_show_stats == true ) then
	for _, v in ipairs( hier_table_sorted ) do
		print(utf8.char(v.ch),"=",	v.val)
	end
end

-- "-c": compact report. Walks the sorted list in order; every time the count
-- differs from the previous record an empty line and the new count are
-- printed, then the characters sharing that count are written on one line,
-- each followed by a space. The grouping relies on the list order, hence
-- ipairs rather than pairs.
-- Open question kept from the author: should combining "-c" with "-s" be
-- treated as a bug? As written, both reports are printed.
if ( flag_compact_stats == true ) then
	local new_val = 0
	for _, v in ipairs( hier_table_sorted ) do
		if v.val ~= new_val then
			-- a new group starts: remember its count and print the heading
			new_val = v.val
			print("")
			print(v.val)
		end
		io.write(utf8.char(v.ch)," ")
	end
	print("")
end