I have written a short concurrent Java program that does all the work for you. All you need to do is pass it two arguments: the location of your seed file (Args[0]; in my case nutch.txt rather than seed.txt) and the fileshare to crawl (Args[1]).
This is the code for fileCrawler.java:
package FileName; import java.io.File; import java.io.PrintWriter; import java.util.ArrayList; public class fileCrawler { private WorkQueue workQ; static int i = 0; public static PrintWriter out; private class Worker implements Runnable { private WorkQueue queue; public Worker(WorkQueue q) { queue = q; } // since main thread has placed all directories into the workQ, we // know that all of them are legal directories; therefore, do not need // to try ... catch in the while loop below public void run() { String name; while ((name = queue.remove()) != null) { File file = new File(name); String entries[] = file.list(); if (entries == null) continue; for (String entry : entries) { if (entry.compareTo(".") == 0) continue; if (entry.compareTo("..") == 0) continue; String fn = name + "/" + entry; if (fn.contains("'")){ if (fn.startsWith("//")){ out.println("file://" + fn.replaceAll(" ", "%20")); //Get rid of all spaces System.out.println("file://" + fn.replaceAll(" ", "%20")); } else{ out.println("file:/" + fn.replaceAll(" ", "%20")); //Get rid of all spaces System.out.println("file:/" + fn.replaceAll(" ", "%20")); } } } } } } public fileCrawler() { workQ = new WorkQueue(); } public Worker createWorker() { return new Worker(workQ); } // need try ... 
catch below in case the directory is not legal public void processDirectory(String dir) { try { File file = new File(dir); if (file.isDirectory()) { String entries[] = file.list(); if (entries != null) workQ.add(dir); for (String entry : entries) { String subdir; if (entry.compareTo(".") == 0) continue; if (entry.compareTo("..") == 0) continue; if (dir.endsWith("/")) subdir = dir+entry; else subdir = dir+"/"+entry; processDirectory(subdir); } } } catch (Exception e) {}; } public static void main(String Args[]) { fileCrawler fc = new fileCrawler(); //now start all of the worker threads System.out.println("Starting new File Crawler on " + Args[1]); int N = 5; ArrayList<Thread> thread = new ArrayList<Thread>(N); for (int i = 0; i < N; i++) { Thread t = new Thread(fc.createWorker()); thread.add(t); t.start(); } //File to be written to //@throws FileNotFoundException try { out = new PrintWriter(Args[0]); } catch(Exception e){ System.err.println("File Not Found: " + Args[0]); } //Directory to be crawled String a = Args[1]; fc.processDirectory(a); //indicate that there are no more directories to add fc.workQ.finish(); //Finally add the directory so that it can be crawled if (a.startsWith("//")){ System.out.println("Adding: file://"+a); out.println("file://"+a); } else{ System.out.println("Adding: file:/"+a); out.println("file:/"+a); } System.out.println("Closing File"); out.close(); //Kill the final threads for (int i = 0; i < N; i++){ try { thread.get(i).join(); } catch (Exception e) {}; } System.out.println("Completed"); } }This is the code for WorkQueue.java:
package FileName; import java.util.*; public class WorkQueue { // // since we are providing the concurrency control, can use non-thread-safe // linked list // private LinkedList<String> workQ; private boolean done; // no more directories to be added private int size; // number of directories in the queue public WorkQueue() { workQ = new LinkedList<String>(); done = false; size = 0; } public synchronized void add(String s) { workQ.add(s); size++; notifyAll(); } public synchronized String remove() { String s; while (!done && size == 0) { try { wait(); } catch (Exception e) {}; } if (size > 0) { s = workQ.remove(); size--; notifyAll(); } else s = null; return s; } public synchronized void finish() { done = true; notifyAll(); } }
No comments:
Post a Comment