#!/usr/bin/perl # # Performs a quick or deep binary scan of all SRRS_xxxx files # Writes an index of all messages with byte positions # Or can be used to extract messages by specifying the headers # # Developer: Dan Swank # # Run with no arguments to display usage information # my $VERBOSE = 0; my $VERSION = "1.2.2"; my $MOD_DATE = "2009-07-30"; my @headers = (); my $bytesread = 0; # For report # protos sub scanFile($@); sub quickScan($); sub headerStrip($); if( $ARGV[0] && !($ARGV[0] =~ m/^-/ ) && (-f $ARGV[0]) ) { &scanFile(shift(@ARGV),@ARGV); exit(0); } elsif( $ARGV[0] && !($ARGV[0] =~ m/^-/ ) && (-d $ARGV[0]) ) { my $fileCount = 0; my $fullStartTime = time; my $myDir = shift(@ARGV); $myDir =~ s/\/*\//\//g; $myDir =~ s/\/*$//g; opendir(DIR,$myDir); my @files = sort grep(/(\.(BIN|PCX|ANX)$)/,readdir(DIR)); closedir(DIR); my $targetCount = $#files+1; if( $VERBOSE ) { print STDOUT "\n$0 directory scan found $targetCount target(s) to process.\n\n"; } my $c = 0; my $procCount = 0; foreach my $f ( @files ) { if( (-f "$myDir/$f") && (-r "$myDir/$f") ) { $procCount++; print STDOUT "Processing ($procCount/$targetCount) >> $myDir/$f\n"; $c += &scanFile("${myDir}/${f}",@ARGV); $fileCount++; } } my $fullEndTime = time; my $fullTime = $fullEndTime - $fullStartTime; print "\n$0:\nProcessed $fileCount Files / $c Records - in $fullTime seconds.\n\n\n"; exit 0; } else { die("\n### srrsscan.pl : \n$ARGV[0]\n\tNo such file or directory! srrsscan.pl [OPTIONS] * If a directory is given, all non-hidden files except (*.scan) will be fed through the processing with the same [OPTIONS] for-each. OPTIONS: -v Verbose Output -SepFile Will Seperate message into individual files Names by WMO header found between messages --output-path If Multiple Output Files are created, use this tp specify the directory to place them --no-index Do not append to or create index file --no-cache -force Do not use cached .scan file, and re-create a new .scan file --type-filter Will Extract with specified TYPE (if -sepfile is ON) -wmo Does the same as -SepFile, except will only write the output files whose WMO headers match the pattern. Will Concatenate all records matched into -sepfile must be ON, for this to function If has --- appending to the end, the will not be attached to the filename, use this when regex patterns exist in the or you will end up with undesirable filenames! -stripheader (-strip) When outputting files, setting this will automatically remove both the LDM and WMO header from the output $0 v.$VERSION ($MOD_DATE) "); } sub scanFile($@) { my $infile = shift; local(@ARGV) = @_; if( !(-r $infile) ) { die("### $0 : Fatal : Input file $infile is not readable by effective user! Check existance & permissions.\n\n"); } my $indexFILE = "${infile}.scan"; my $size = (-s $infile); my $starttime = time; my $createINDEX = 1; my $SEPERATE_FILE = 0; my $TARGET_WMO = ""; my $outDIR = "DUMP"; my $outFILE = "./srrsset.out"; my $mode = "unknown"; my $string = ""; # the main data stream my $stringlen = 0; # accounts Lenght of unpacked data stream my $stripHeader = 0; my $forceMode = 0; while( @ARGV ) { $arg = shift(@ARGV); if( $VERBOSE ) { print "Parsing arg: $arg\n"; } if( $arg eq "-v" ) { $VERBOSE = 1; print "\nVerbose option on.\n\n"; } if( $arg =~ m/-[sS]ep([fF]ile)?/g ) { $SEPERATE_FILE = 1; } if( $arg eq "--output-path" ) { $outDIR = shift(@ARGV); } if( $arg eq "--no-index" ) { $createINDEX = 0; } if( $arg =~ m/-[sS]trip([hH]eader)?/ ) { $stripHeader = 1; } if( $arg =~ m/^\-(\-no\-cache|force)/ ) { $forceMode = 1; } if( $arg eq "-wmo" ) { $SEPERATE_FILE = 1; $TARGET_WMO = shift(@ARGV); $outFILE = shift(@ARGV); } if( $arg eq "--type-filter" ) { $SEPERATE_FILE = 1; $TYPE_FILTER = shift(@ARGV); } } # Check and either read or initialize index (.scan) file undef my @SCANDATA; if( $createINDEX ) { if( ((-r $indexFILE) && (-s $indexFILE)) && !($forceMode) ) { open(IDX,"<",$indexFILE); @SCANDATA = readline(IDX); close(IDX); $mode ="Cached read"; if( $VERBOSE ) { print "$0 : INDEX\n\tFound and using cached index file.\n\t$indexFILE\n\n"; } $createINDEX = 0; } else { if( $forceMode && (-e $indexFILE) ) { unlink($indexFILE); } open(INDEX,">",$indexFILE) || die("### Could not open index file.\n\n"); if( $VERBOSE ) { print " $0 : initializing index file\n\t$indexFILE .\n\n"; } } } elsif( $VERBOSE ) { print "Index creation forced OFF!\n\n"; } if( $VERBOSE && $outDIR ) { print " * Output being sent to: $outDIR .\n"; } if( $VERBOSE && $SEPERATE_FILE && ($outDIR || $outFILE) ) { print " * Files will be seperated.\nOutDIR > $outDIR\nOutFile > $outFILE\n"; } if( $VERBOSE ) { print "\n"; } # Data holders undef my @bytepos; undef @headers; undef my @RecType; # IF the data was NOT cached-in -- # First, attempt a quick, shortcut, scan # if this fails, proceed to the deep scan if( !@SCANDATA ) { @SCANDATA = &quickScan($infile); # IF this worked, yay! we can write the INDEX & skip the deep scan if( @SCANDATA ) { $mode = "Quick Scan"; print INDEX @SCANDATA; undef @headers; foreach my $scanline ( @SCANDATA ) { chomp($scanline); $scanline =~ s/\s*\s/\t/g; my ($f,$a,$LDM_HEADER,$WMO_HEADER,$fileheader) = split(/[\:\t]/,$scanline); chomp($a); chomp($LDM_HEADER); chomp($WMO_HEADER); chomp($fileheader); push(@bytepos,$a); push(@headers,$WMO_HEADER); push(@RecType,$fileheader); } goto SCANDATA_CACHED; } } # If index data exists, this loop restores it if( @SCANDATA ) { if( $VERBOSE ) { print " $0 : Found ".int($#SCANDATA+1)." scan information records from previous run\n\n"; } foreach my $scanline ( @SCANDATA ) { my @parts = split(/[\t\:]/,$scanline); $a = $parts[1]; $WMO_HEADER = $parts[3]; $fileheader = $parts[4]; chomp($a); chomp($WMO_HEADER); chomp($fileheader); if( $VERBOSE ) { print "cached >> @parts"; } push(@bytepos,$a); push(@headers,$WMO_HEADER); push(@RecType,$fileheader); } goto SCANDATA_CACHED; } # This loop performs the file-scan # quite possibly further optimizable $mode = "Deep Scan"; open(IN,"<",$infile); binmode(IN); $bytesread = read(IN,$buffer,$size); close(IN); $string = unpack("A*",$buffer); $stringlen = length($string); for( my $a = 0; $a < $stringlen - 18; $a++ ) { #print STDOUT "DE BUG: byte pos : $a\n"; $piece = substr($string,$a,18); if( $piece =~ m/^\*\*\*\*\d{6,10}\*\*\*\*/g || $piece =~ m/^\#\#\#\#\d{6,10}\#\#\#\#/g ) { my $newpiece = $piece; my $newpiece2 = $piece; # print STDOUT "DE BUG : 1 [ $newpiece ] [ $newpiece2 ] - $id_len\n\n"; $newpiece2 =~ s/\*\*\*\*\d{6,10}\*\*\*\*//g; $newpiece2 =~ s/\#\#\#\#\d{6,10}\#\#\#\#//g; # print STDOUT "DE BUG: 2 [ $newpiece ] [ $newpiece2 ] - $id_len\n\n"; if( $newpiece2 ne "" ) { $newpiece =~ s|$newpiece2||g; } $id_len = length($newpiece); $WMO_HEADER = substr($string,$a+$id_len+1,18); $WMO_HEADER =~ s| |_|g; # Get file extension -=- # Finds Alphabetical characters in first 4 chars # after the first line feed after WMO header $fileheader = substr($string,$a+$id_len+19,11); $fileheader = "${newpiece2}${fileheader}"; #print STDOUT "DE BUG: fileheader[$fileheader]\n"; $fileheader =~ s|^.*\n||g; $fileheader = substr($fileheader,0,6); $fileheader =~ s|\W||g; $fileheader =~ s|\n||g; $fileheader =~ s|\d||g; if( length($fileheader) <= 2 ) { $fileheader = ""; } # Auto-Complete partial file headers if( $fileheader eq "BUF" ) { $fileheader = "BUFR"; } if( $fileheader eq "GRI" ) { $fileheader = "GRIB"; } if( $fileheader eq "DFA" ) { $fileheader = "DFAX"; } if( length($fileheader) > 4 ) { $fileheader = substr($fileheader,0,4); } push(@bytepos,$a); push(@headers,$WMO_HEADER); push(@RecType,$fileheader); if( $VERBOSE ) { print "$infile:$a\t$newpiece\t$WMO_HEADER\t$fileheader\n"; } # print "3 [ $newpiece ] [ $newpiece2 ] - $id_len - $WMO_HEADER\n\n"; if( $createINDEX ) { print INDEX "$infile:$a\t$newpiece\t$WMO_HEADER\t$fileheader\n"; } $a += length($newpiece); } elsif( $piece !~ m/[\*\#]/g ) { $a+=7; } } push(@bytepos,$bytesread); SCANDATA_CACHED: # Record Seperator if( $SEPERATE_FILE ) { if( $VERBOSE ) { print "\n==========> SEPERATING RECORDS <================\n\n"; } open(IN,"<",$infile); binmode(IN); # Load data ~ (if it isn't) # if( !$stringlen ) # { # open(IN,"<",$infile); # binmode(IN); # $bytesread = read(IN,$buffer,$size); # close(IN); # $string = unpack("A*",$buffer); # $stringlen = length($string); # } if( !(-d $outDIR ) ) { mkdir($outDIR); } if( $TARGET_WMO ) { my $otf = "${outDIR}/$outFILE.$TARGET_WMO"; if( $outFILE =~ m/\-\-\-$/ ) { $outFILE =~ s/\-\-\-$//g; $otf = "${outDIR}/$outFILE"; } $otf =~ s/\?/\./g; $OUTFILE = $otf; if( !(-e $otf) || ( (-w $otf) && (-z $otf) ) ) { open(OUTWMO,">$otf") || die("\n### Cannot open Output File $otf\n\n"); } elsif( (-s $otf) > 0 ) { open(OUTWMO,">>$otf") || die("\n### Cannot open Output File $otf\n\n"); } else { print "### $0: Could not determine status of outfile $otf.\n\n"; } binmode(OUTWMO); } my $i = 0; $skipped_wmo_recs = 0; $skipped_type_recs = 0; foreach my $head (@headers) { if( $TARGET_WMO && !($head =~ m/$TARGET_WMO/g ) ) { $skipped_wmo_recs++; $i++; next; } my $bpos = $bytepos[$i]; my $rectype = $RecType[$i]; $rectype =~ s/\W*\W/\./g; my $byte_len = $bytepos[$i+1] - $bpos; # special case for last record if( $i == $#headers ) { $byte_len = $size - $bpos; } if( $bpos < 0 || $byte_len <= 0 ) { print STDERR "\n*** $0: WARNING negative byte length or offset! \n($byte_len\@$bpos) Terminated parse at record \# ".int($i+1)."/".int($#headers+1)."\n\n"; if( $VERBOSE ) { print STDOUT " Attempted to read $byte_len bytes \@ $bpos\n"; } return($i); } my $filedata; seek(IN,$bpos,0); read(IN,$filedata,$byte_len); if( $VERBOSE ) { print " Record $head ( $byte_len \@ $bpos )\n\t--> $OUTFILE\n"; } # filedata = substr($buffer,$bpos,$byte_len); if( $TYPE_FILTER ne "" && !($rectype =~ m/$TYPE_FILTER/g) ) { $skipped_type_recs++; $i++; next; } # Write the Record if( $TARGET_WMO eq "" ) { $OUTFILE = "$outDIR/$head.$rectype"; open(OUT,">",$OUTFILE) || die("### Outfile $OUTFILE failed to open for writting.\n\n"); binmode(OUT); if( $stripHeader ) { $filedata = headerStrip($filedata); } print OUT $filedata; close(OUT); if( $VERBOSE ) { print " Record $head ( $byte_len \@ $bpos )\n\t--> $OUTFILE\n"; } } else { if( $VERBOSE ) { print "Record ".int($i+1)." [$head] : $bpos + $byte_len -> $OUTFILE\n"; } if( $stripHeader ) { $filedata = headerStrip($filedata); } print OUTWMO $filedata; } $i++; } close(IN); } if( $TARGET_WMO ) { close(OUTWMO); } $endtime = time; $et = $endtime - $starttime; $wmo_filtered = ($#headers+1) - $skipped_wmo_recs; $type_filtered = ($#headers+1) - $skipped_type_recs; $skipped_recs = $skipped_wmo_recs + $skipped_type_recs; if( $VERBOSE ) { my $percRead = sprintf("%.3f",($bytesread / $size * 100)); print "\n ======== REPORT =============\n"; print "Read in $infile \n $bytesread / $size bytes read. $percRead \%\n"; print "Method : $mode\n\n"; print "Found: ".int($#headers+1)." Conforming Records.\n"; print " $wmo_filtered Recs. Matched WMO header. \"$TARGET_WMO\"\n"; print " $type_filtered Recs. Matched Type \"$TYPE_FILTER\" \n"; print " $skipped_recs Total Filtered Records.\n"; print "Wrote Index : $indexFILE\n"; print "Time Elapsed : $et sec.\n\n"; print "$0\n\tDone.\t".`date`."\n\n"; } if( $createINDEX ) { close(INDEX); } return(int($#headers+1)); } # ======================================================================= # Scan via. LDM record indicators # returns array of record information or empty "" # if the verification failed. sub quickScan($) { $headline = "$0:quickScan : "; my $filePath = shift; open(IN,"<",$filePath); binmode(IN); my $loopBreaker = 0; my $stop = 0; my $bytesToScan = 100; my $seekPos = 0; my @returnData = (); # open(LOG,">","/dev/shm/srrsscan.$$.log"); my $loopCount = 0; while ( $stop == 0 && $loopBreaker < 1000000 ) { $loopCount++; $loopBreaker++; my $startSeekPos = $seekPos; # remember the seek pos at start seek(IN,$seekPos,0); # print STDOUT "DE BUG: $filePath \@b $seekPos\n"; my $ibytesread = read(IN,$buffer,$bytesToScan); # print LOG "\n\n${loopBreaker}[[$buffer]]\n"; # END of file, or I/O error, so get out of here if( $ibytesread != $bytesToScan ) { #print STDOUT "DE BUG STOPPED :: $ibytesread != $bytesToScan\n"; $stop++; next; } $bytesread += $ibytesread; # print STDOUT "[$buffer]\n\n"; my @ldmHeader = $buffer =~ m/([\*\#]{4})(\d{3})(\d{3,7})([\*\#]{4}[\n\r])/g; my $completeHeader = "${ldmHeader[0]}${ldmHeader[1]}${ldmHeader[2]}${ldmHeader[3]}"; my $posTest = index($buffer,$completeHeader); if( $posTest > 0 ) { # jump to the ldm header and start over. if( $VERBOSE ) { print STDOUT " *!* Clipped $posTest bytes at beginning of buffer\n"; } $seekPos += $posTest;; next; } my $ldmHeaderLength = length($completeHeader); my $ldmHeaderLengthCheck = int($ldmHeader[1]); my $ldmHeaderMsgLen = int($ldmHeader[2]); # in **** form, we need to make an assumptions. if( ($ldmHeader[0] eq "****") && ($ldmHeaderLengthCheck == 0) ) { $ldmHeaderLengthCheck = $ldmHeaderLength; $ldmHeaderMsgLen = int("${ldmHeader[1]}${ldmHeader[2]}"); } # @chrData = unpack("C*",$buffer); # print STDOUT "DE BUG ldmHeader [[@ldmHeader]] length of $ldmHeaderMsgLen b\n"; # print LOG "{{@chrData}}"; # Failed to find a LDM header, continue seek until one is found or until EoF if( $#ldmHeader == -1 ) { $seekPos += int($bytesToScan/2); next; } if( $ldmHeaderLength != $ldmHeaderLengthCheck ) { $seekPos += int($bytesToScan/2); next; } # use the internal ldm header length to seek.and read msg $seekPos += $ldmHeaderLengthCheck; seek(IN,$seekPos,0); my $readCheck = read(IN,$ldmMessage,$ldmHeaderMsgLen ); if( $readCheck != $ldmHeaderMsgLen ) { print STDERR "Read error \@ byte $seekPos\n"; close(IN); return(""); } $seekPos += $ldmHeaderMsgLen; my @fullHeader = $buffer =~ m/([\*\#]{4}\d{6,10}[\*\#]{4})[\n\r]+([^\r\n]*)[\n\r]+(......)/gs; my $wmoHeader = substr($ldmMessage,0,18); # occasionally, three may be a few characters trailing the wmo header, deal with it.a my @trailing = $ldmMessage =~ m/($wmoHeader)([ \w]*)?[\r\n]/; my $wmoHeaderExten = 0; if( defined($trailing[1]) ) { $wmoHeaderExten += length($trailing[1]); } # print STDOUT "DE BUG : trailing [$trailing[1]] x $wmoHeaderExten \n"; my $msgHead = substr($ldmMessage,18+$wmoHeaderExten,8); $msgHead =~ s/[\n\r\s]//g; # print STDOUT "DE BUG: wmoHeader ($wmoHeader) msgHead ($msgHead) \n"; $wmoHeader =~ s/\s/\_/g; $wmoHeader =~ s/_*$//g; push(@headers,$wmoHeader); # print LOG " fullHeader = [[".join('|',@fullHeader)."]]\n"; # print "DE BUG : ($fullHeader[0]) ($fullHeader[1]) ($buffer)\n"; undef my @formatID; @formatID = $msgHead =~ m/(GRIB|BUFR|DFAX|PNG|GIF|IMAG|\*\*\*\*|\#\#\#\#)/g; # Check for truncated zero-byte record # if there is another ldm header tag within the body of this record == big issue. if( $formatID[0] eq "\*\*\*\*" || $formatID[0] eq "\#\#\#\#" ) { my @fullHeader2 = $buffer =~ m/([\*\#]{4}\d{6,10}[\*\#]{4}[\n\r]+[^\r\n]*[\n\r]+)(......)/gs; $headLen = length($fullHeader2[0]); $loopCount--; $seekPos += $headLen; undef $buffer; if( $VERBOSE ) { print STDERR " ! $headline WARNING Skipping Probable truncated record \# $loopCount (>> $headLen bytes)\n"; } next; } # "redbook graphics" have a funky header # this translates it before non-alphanumerics are removed # Identify redbook graphics.... ...... if( ($#formatID == -1) ) { @bytes = unpack("C6",$fullHeader[2]); if( ($bytes[0] == 64) && ($bytes[1] == 16) && ($bytes[2] == 1) && ($bytes[3] == 1) ) { $formatID[0] = "RDBK"; } else { print "Rec:$loopCount UnRecognised file format: $wmoHeader [@bytes] [[@fullHeader]]\n"; } } $formatID[0] =~ s/\W//g; chomp($formatID[0]); if( $VERBOSE ) { print STDOUT " --> $wmoHeader Format: $formatID[0] \n"; } if( $formatID[0] eq "GRIB" ) { my $newMsgHead = substr($ldmMessage,18+$wmoHeaderExten,20); my $grbSectZero = substr( $newMsgHead , index($newMsgHead,"GRIB") ,8); my @grbSectZeroData = unpack("C*",$grbSectZero); $formatID[0] = "${formatID[0]}${grbSectZeroData[7]}"; #print "DE BUG : TEST! [$msgHead] < @grbSectZeroData > \n"; } # my @tmp = $ldmHeader[0] =~ m/(\d{6,10})/g; my $recordLength = "${ldmHeader[1]}${ldmHeader[2]}"; # if( $recordLength == 0 ) # { close(IN); return(undef); } # Remember the record information push(@returnData,"${filePath}\:${startSeekPos}\t${ldmHeader[1]}${ldmHeader[2]}\t${wmoHeader}\t$formatID[0]\n"); # $seekPos += ($ldmHeaderLength + $recordLength + 1 ); if( $VERBOSE ) { print "$returnData[$#returnData]\n"; } } if( $VERBOSE ) { print STDOUT " exiting subroutine quickScana with ($#returnData) \n"; } close(IN); # close(LOG); return(@returnData); } # removes the ldm and wmo header from a buffer string # returns the trimmed buffer sub headerStrip($) { my $theBuffer = shift; # my $in = length($theBuffer); $theBuffer =~ s/[\*\#]{4}\d{6,10}[\*\#]{4}[\r\n]+[^\r\n]*[\r\n]+//s; # my $out = length($theBuffer); # my $removed = $in - $out; return($theBuffer); }