diff options
Diffstat (limited to 'utf8_count/utf8_count.lua')
-rw-r--r-- | utf8_count/utf8_count.lua | 139 |
1 files changed, 123 insertions, 16 deletions
#!/usr/bin/lua
--utf8_count: count how often every UTF-8 character occurs in a list of files
--usage: utf8_count.lua [FLAGS] -f FILE [FILE ...]
--the top-level names (flags, tables, functions) are kept global, as they
--were before, so nothing that refers to them by name stops working

debug_mode = false

--check if version is supported
--why 5.3? utf8 and 64bit support
--the libraries are tested for directly instead of comparing _VERSION with
--"Lua 5.3", so newer interpreters that provide them are accepted too
if ( utf8 == nil or math.type == nil ) then
  io.stderr:write( "Lua 5.3 or newer is required, found ", tostring(_VERSION), "\n" )
  os.exit(1)
end

--if there is command line arguments then try to read and support them
--get list of files if there is such "-f [FILE] [FILE] ... "

--characters "-i" is meant to skip; the flag is parsed but the filtering
--itself is not implemented yet, so this list is currently unused
list_ignore_chars = " :。,.][《》!?:"

arg_files = {}
flag_input_files = false
flag_show_stats = false
flag_compact_stats = false
flag_ignore_chars = false

--stop collecting file names; called by every option that ends a "-f" list
function reset_flags()
  flag_input_files = false
end

--print the usage text
local function print_help()
  --arg[0] is the script name; negative indices hold the interpreter and
  --its options, which is not what a usage line should show
  local script_name = ( arg and arg[0] ) or "utf8_count.lua"
  print( script_name, " [FLAGS] [FILE] " )
  print( "-f [FILE] ... - list of files to read" )
  print( "-s - show char stat" )
  print( "-c - show compact char stat" )
  print( "-d - debug mode" )
  --print( "-i - ignore whitespaces, newlines")
  print( "-h - just this help" )
end

--"arg" is only set by the standalone interpreter, fall back to no arguments
for _,v in ipairs( arg or {} ) do
  if v == "-f" then
    flag_input_files = true
  elseif v == "-s" then
    reset_flags()
    flag_show_stats = true
  elseif v == "-c" then
    reset_flags()
    flag_compact_stats = true
  elseif v == "-d" then
    --does not end a "-f" list: "-f a.txt -d b.txt" still reads both files
    debug_mode = true
  elseif v == "-i" then
    reset_flags()
    flag_ignore_chars = true
  elseif v == "-h" then
    reset_flags()
    print_help()
  elseif flag_input_files then
    --not very nice way to parse all things but still:
    --every other word that follows "-f" is taken as a file name
    arg_files[#arg_files + 1] = v
  end
end

if debug_mode then
  print( arg_files )
  for k,v in ipairs(arg_files) do
    print(k,v)
  end
end

--add the counters of t2 to t1
--t1 is modified in place and is also the return value
--t1: table code point -> count, receives the sums
--t2: table code point -> count, only read
--returns t1
function merge_tables( t1, t2 )
  for k,v in pairs( t2 ) do
    t1[k] = ( t1[k] or 0 ) + v
  end
  return t1
end

--count the characters of one file
--filename: path of a UTF-8 encoded text file
--returns a table code point -> number of occurrences; line ends are not
--counted because lines() strips them
--exits with status 1 if the file can not be opened or is not valid UTF-8
function get_file_stat( filename )
  local counts = {}
  --open file
  local file, err = io.open( filename, "r" )
  if ( file == nil ) then
    --err already holds the file name and the reason
    io.stderr:write( "Could not open file: ", tostring(err), "\n" )
    os.exit(1)
  end
  local line_no = 0
  for line in file:lines() do
    line_no = line_no + 1
    --utf8.codes raises an error on invalid input, so check the line first
    --to be able to say where the problem is
    local len, bad_pos = utf8.len( line )
    if ( len == nil ) then
      file:close()
      io.stderr:write( string.format( "%s:%d: invalid UTF-8 at byte %d\n", filename, line_no, bad_pos ) )
      os.exit(1)
    end
    for _,c in utf8.codes( line ) do
      counts[c] = ( counts[c] or 0 ) + 1
    end
  end
  file:close()
  return counts
end

--read content in utf8 and sum up the statistics of all files
hier_table = {}
for _,fn in ipairs(arg_files) do
  hier_table = merge_tables( hier_table, get_file_stat( fn ) )
end

hier_table_sorted = {}
for k,v in pairs(hier_table) do
  table.insert( hier_table_sorted, {ch=k,val=v})
end

--sort table chars
--order: highest count first, equal counts by ascending code point, so the
--output does not depend on the traversal order of pairs()
function cmpa( a,b )
  if a.val == nil or b.val == nil then
    return false
  end
  if a.val ~= b.val then
    return a.val > b.val
  end
  return a.ch < b.ch
end
table.sort( hier_table_sorted, cmpa )

--ipairs, not pairs: only ipairs guarantees the sorted order 1..n
if flag_show_stats then
  for _,v in ipairs(hier_table_sorted) do
    print(utf8.char(v.ch),"=", v.val)
  end
end

--combined with "-s" should be counter as bug?
--compact form: each distinct count on its own line, followed by one line
--with all characters that occur that often
if flag_compact_stats then
  local current_val = 0
  for _,v in ipairs( hier_table_sorted ) do
    if v.val ~= current_val then
      --a new group starts: finish the previous line and print the count
      current_val = v.val
      print("")
      print(v.val)
    end
    io.write(utf8.char(v.ch)," ")
  end
  print("")
end