diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..1ff03788
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.metadata
diff --git a/221801132/README.md b/221801132/README.md
new file mode 100644
index 00000000..8c02ae79
--- /dev/null
+++ b/221801132/README.md
@@ -0,0 +1,48 @@
+WordCount
+---------------
+
+
+**基本功能**
+
+假设有一个软件每隔一小段时间会记录一次用户的搜索记录,记录为英文。
+输入文件和输出文件以命令行参数传入。例如我们在命令行窗口(cmd)中输入:
+
+
+>java WordCount input.txt output.txt
+
+则会统计input.txt中的以下几个指标
+
+**1、统计文件的字符数(对应输出第一行):**
+
+ - 只需要统计ASCII码,汉字不需考虑
+ - 空格,水平制表符,换行符,均算字符
+
+
+
+**2、统计文件的单词总数(对应输出第二行),单词:至少以4个英文字母开头,跟上字母数字符号,单词以分隔符分割,不区分大小写。**
+
+ - 英文字母: A-Z,a-z
+ - 字母数字符号:A-Z, a-z,0-9
+ - 分隔符:空格,非字母数字符号
+ - 例:file123是一个单词, 123file不是一个单词。file,File和FILE是同一个单词
+
+
+
+**3、统计文件的有效行数(对应输出第三行):任何包含非空白字符的行,都需要统计。**
+
+**4、统计文件中各单词的出现次数(对应输出接下来10行),最终只输出频率最高的10个。**
+
+ - 频率相同的单词,优先输出字典序靠前的单词。
+
+>例如,windows95,windows98和windows2000同时出现时,则先输出windows2000
+
+ - 输出的单词统一为小写格式
+
+**然后将统计结果输出到output.txt,输出的格式如下;其中word1和word2 对应具体的单词,number为统计出的个数;换行使用'\n',编码统一使用UTF-8。**
+
+>characters: number
+words: number
+lines: number
+word1: number
+word2: number
+...
diff --git a/221801132/codestyle.md b/221801132/codestyle.md
new file mode 100644
index 00000000..f63e22cd
--- /dev/null
+++ b/221801132/codestyle.md
@@ -0,0 +1,51 @@
+代码规范
+----
+
+
+----------
+
+**缩进**
+
+ - 4个空格
+
+**每行最多字符数**
+
+ - 80字符
+
+**函数最大行数**
+
+ - 100行
+
+
+**函数、类命名**
+
+ - 类名使用UpperCamelCase风格,必须遵从驼峰形式。
+ - 命名尽量使用英文单词,力求简单清楚
+
+**变量命名**
+
+ - 且尽量使用单词命名,一律小写
+ - 禁止取单个字符(如i、j、k),但 i、j、k作局部循环变量是允许的。
+
+**常量**
+
+ - 全部大写
+ - 不允许未经定义的常量直接出现在代码中
+
+**空行规则**
+
+ - 相对独立的程序块之间、变量说明之后必须加空行
+ - 不允许把多个短语句写在一行中,一行只写一条语句
+ - if、for、do、while、case、switch、default 等语句自占一行,且if、for、do、while等语句的执行语句部分无论多少都要加括号{}。
+
+**注释规则**
+
+ - 使用//
+ - 注释的内容要清楚、明了,不能有二义性
+ - 操作符前后空格 操作符前后必须加一个空格
+
+**其他规则**
+
+ - 严禁使用拼音与英文混合的方式,更不允许直接使用中文的方式
+ - 用大写的’L’代替’l’
+
diff --git a/221801132/src/Lib.java b/221801132/src/Lib.java
new file mode 100644
index 00000000..19c52bea
--- /dev/null
+++ b/221801132/src/Lib.java
@@ -0,0 +1,243 @@
+package WordCount;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Text statistics behind the WordCount command-line tool.
+ *
+ * <p>Each public counting method reads the input file once and writes one
+ * section of the report ("characters", "words", "lines", top words) to
+ * the output file. Files are read and written as UTF-8 and every report
+ * line ends with '\n'.
+ */
+public class Lib {
+    // Number of leading ASCII letters a token needs to count as a word.
+    private static final int MIN_LETTER_PREFIX = 4;
+
+    // How many of the most frequent words go into the report.
+    private static final int TOP_WORD_LIMIT = 10;
+
+    /**
+     * Opens a file for reading as UTF-8 text.
+     *
+     * @param fileName path of the file to read
+     * @return a buffered reader, or {@code null} (after printing a
+     *         message to standard output) when the file cannot be opened
+     */
+    public static Reader InputFile(String fileName) {
+        try {
+            return openReader(fileName);
+        } catch (FileNotFoundException e) {
+            System.out.println("找不到输入文件!");
+            return null;
+        }
+    }
+
+    /**
+     * Opens a file for appending UTF-8 text, creating it when missing.
+     *
+     * @param fileName path of the file to append to
+     * @return a buffered writer positioned at the end of the file
+     * @throws IOException if the file cannot be opened for writing
+     */
+    public static BufferedWriter OutputFile(String fileName)
+            throws IOException {
+        return openWriter(fileName, true);
+    }
+
+    /**
+     * Counts every character of the input and starts a fresh report.
+     *
+     * <p>Unlike the other counting methods this one truncates the output
+     * file, so it has to run first.
+     *
+     * @param inputFile path of the text to analyse
+     * @param outputFile path of the report; overwritten
+     * @return number of characters read, whitespace included
+     * @throws IOException if either file cannot be opened, read or written
+     */
+    public static int CountCharacters(String inputFile, String outputFile)
+            throws IOException {
+        int charactersNum = 0;
+        try (Reader reader = openReader(inputFile)) {
+            while (reader.read() != -1) {
+                charactersNum++;
+            }
+        }
+        writeReportLine(outputFile, false, "characters: " + charactersNum);
+        return charactersNum;
+    }
+
+    /**
+     * Counts the words of the input and appends the total to the report.
+     *
+     * <p>A word is a maximal run of ASCII letters and digits whose first
+     * four characters are letters; any other character is a separator.
+     *
+     * @param inputFile path of the text to analyse
+     * @param outputFile path of the report; appended to
+     * @return number of words found
+     * @throws IOException if either file cannot be opened, read or written
+     */
+    public static int CountWords(String inputFile, String outputFile)
+            throws IOException {
+        int wordsNum = readWords(inputFile).size();
+        writeReportLine(outputFile, true, "words: " + wordsNum);
+        return wordsNum;
+    }
+
+    /**
+     * Counts the lines that contain at least one non-whitespace character
+     * and appends the total to the report.
+     *
+     * @param inputFile path of the text to analyse
+     * @param outputFile path of the report; appended to
+     * @return number of non-blank lines
+     * @throws IOException if either file cannot be opened, read or written
+     */
+    public static int CountLines(String inputFile, String outputFile)
+            throws IOException {
+        int linesNum = 0;
+        boolean hasVisibleChar = false;
+        try (Reader reader = openReader(inputFile)) {
+            int current;
+            while ((current = reader.read()) != -1) {
+                if (current == '\n') {
+                    if (hasVisibleChar) {
+                        linesNum++;
+                    }
+                    hasVisibleChar = false;
+                } else if (!Character.isWhitespace(current)) {
+                    hasVisibleChar = true;
+                }
+            }
+        }
+        // The last line may end at end-of-file without a trailing '\n'.
+        if (hasVisibleChar) {
+            linesNum++;
+        }
+        writeReportLine(outputFile, true, "lines: " + linesNum);
+        return linesNum;
+    }
+
+    /**
+     * Appends the most frequent words to the report, one per line.
+     *
+     * <p>Words are lower-cased, ranked by descending frequency and, for
+     * equal frequency, in ascending dictionary order. At most ten are
+     * written.
+     *
+     * @param inputFile path of the text to analyse
+     * @param outputFile path of the report; appended to
+     * @return the last word written (the least frequent one reported), or
+     *         {@code null} when the input contains no word
+     * @throws IOException if either file cannot be opened, read or written
+     */
+    public static String WordsNumSort(String inputFile, String outputFile)
+            throws IOException {
+        Map<String, Integer> frequencies = new HashMap<>();
+        for (String word : readWords(inputFile)) {
+            frequencies.merge(word, 1, Integer::sum);
+        }
+
+        List<Map.Entry<String, Integer>> ranked =
+                new ArrayList<>(frequencies.entrySet());
+        ranked.sort((left, right) -> {
+            int byCount = right.getValue().compareTo(left.getValue());
+            if (byCount != 0) {
+                return byCount;
+            }
+            return left.getKey().compareTo(right.getKey());
+        });
+
+        String lastWritten = null;
+        int limit = Math.min(TOP_WORD_LIMIT, ranked.size());
+        try (BufferedWriter writer = openWriter(outputFile, true)) {
+            for (Map.Entry<String, Integer> entry : ranked.subList(0, limit)) {
+                lastWritten = entry.getKey();
+                writer.write(lastWritten + ": " + entry.getValue() + "\n");
+            }
+        }
+        return lastWritten;
+    }
+
+    // Opens fileName as buffered UTF-8 text; fails fast when it is missing.
+    private static Reader openReader(String fileName)
+            throws FileNotFoundException {
+        return new BufferedReader(new InputStreamReader(
+                new FileInputStream(fileName), StandardCharsets.UTF_8));
+    }
+
+    // Opens fileName as a buffered UTF-8 writer, appending or truncating.
+    private static BufferedWriter openWriter(String fileName, boolean append)
+            throws IOException {
+        return new BufferedWriter(new OutputStreamWriter(
+                new FileOutputStream(fileName, append),
+                StandardCharsets.UTF_8));
+    }
+
+    // Writes one '\n'-terminated line to the report and closes the file.
+    private static void writeReportLine(String fileName, boolean append,
+            String text) throws IOException {
+        try (BufferedWriter writer = openWriter(fileName, append)) {
+            writer.write(text + "\n");
+        }
+    }
+
+    // Splits the input on non-alphanumeric characters and returns the
+    // valid words, lower-cased, in order of appearance.
+    private static List<String> readWords(String inputFile)
+            throws IOException {
+        List<String> words = new ArrayList<>();
+        StringBuilder token = new StringBuilder();
+        try (Reader reader = openReader(inputFile)) {
+            int current;
+            while ((current = reader.read()) != -1) {
+                if (isAsciiLetter(current) || isAsciiDigit(current)) {
+                    token.append((char) Character.toLowerCase(current));
+                } else {
+                    flushToken(token, words);
+                }
+            }
+        }
+        // The last token may end at end-of-file without a separator.
+        flushToken(token, words);
+        return words;
+    }
+
+    // Adds the token to words when it is a valid word, then clears it.
+    private static void flushToken(StringBuilder token, List<String> words) {
+        if (isWord(token)) {
+            words.add(token.toString());
+        }
+        token.setLength(0);
+    }
+
+    // A word starts with at least MIN_LETTER_PREFIX ASCII letters.
+    private static boolean isWord(CharSequence token) {
+        if (token.length() < MIN_LETTER_PREFIX) {
+            return false;
+        }
+        for (int i = 0; i < MIN_LETTER_PREFIX; i++) {
+            if (!isAsciiLetter(token.charAt(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private static boolean isAsciiLetter(int value) {
+        return (value >= 'a' && value <= 'z')
+                || (value >= 'A' && value <= 'Z');
+    }
+
+    private static boolean isAsciiDigit(int value) {
+        return value >= '0' && value <= '9';
+    }
+}
diff --git a/221801132/src/WordCount.java b/221801132/src/WordCount.java
new file mode 100644
index 00000000..5698b7c7
--- /dev/null
+++ b/221801132/src/WordCount.java
@@ -0,0 +1,39 @@
+package WordCount;
+
+import java.io.*;
+import java.util.*;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Command-line entry point: {@code java WordCount input.txt output.txt}.
+ */
+public class WordCount {
+    /**
+     * Writes the character, word, line and top-word statistics of the
+     * input file to the output file.
+     *
+     * @param args input file path followed by output file path
+     * @throws IOException if reading the input or writing the report fails
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length != 2) {
+            System.out.println("命令行参数错误,需要两个文件名!");
+            // Non-zero status so calling scripts can detect the failure.
+            System.exit(1);
+        }
+        String inputFile = args[0];
+        String outputFile = args[1];
+        if (!new File(inputFile).isFile()) {
+            System.out.println("找不到输入文件!");
+            System.exit(1);
+        }
+        // CountCharacters truncates the report; the others append to it.
+        Lib.CountCharacters(inputFile, outputFile);
+        Lib.CountWords(inputFile, outputFile);
+        Lib.CountLines(inputFile, outputFile);
+        Lib.WordsNumSort(inputFile, outputFile);
+    }
+}