#!/opt/local/bin/perl -w
# regexp.pl
# example uses of regular expressions
#
# This file was captured from a shell session in which the script was
# listed with "more regexp.pl" and then run with "perl regexp.pl".  The
# recorded session output is reproduced as comments at the end of the file.

use strict;
use warnings;

# @protein contains test1.pep and test3.pep
# Even indices hold FASTA ID lines, odd indices hold the sequences.  Each
# sequence is joined into ONE contiguous string (no embedded whitespace), so
# a motif that straddles a chunk boundary below is still found.
my @protein = (
    '>A28086 Length: 104',
    join( '', qw(
        VKQIESKSAFQEVLDSAGDKLVVVDFSATWCGPCKMIKPFFHALSEKFNNVVFIEVDVDDCKDIAAECEVKCMPTFQFF
        KKGQKVGEFSGANKEKLEATINELL
    ) ),
    '>ORF15 S. aureus pathogenicity island',
    join( '', qw(
        MEDVTNEEVFEMIDSRTGVLNANDWKSQLRRSATTQALKKTTTNAEIILCNDESLKGLVQYDAF
        EKVTKLKRLPYWRSKGDTNYYWADIDTTHVISHIDKLYNVQFSRDLIDTVIEKEAYQNRFHPIKSMIESKSWDGIKRIET
        LFIDYLGAEDNHYNREVTKKWMMGAVARIYQPGIKYDSMIILYGGQGVGKSTAVSKLGGHWYNQSIKTFKGDEVYKKLQG
        SWICEIEELSAFQKSTIEDIKGFISAIVDIYRASYGKRTERHPRQCVFVGTTNNYEFLKDQTGNRRFFPITTDKNKATKS
        PFDDLTPDVVQQMFAEAKVYFDEDPTDKALLLDKEASEMALKVQEAHSEKDALVGEIEEFLERPIPSDYWYRTLEEKRVS
        AHDVIDQDYIKLYGDGKLIELPNTKPGAYVWRDKVCSMEIWKVMMKRDDQPQQHHLRKIDKALRNTNYCDTVKKQTRYGE
        GIGKQYGFSVDLASYYKNLKV
    ) ),
);

# in regular expressions, ^ means start of line, $ means end of line
# .     matches any character
# +     match immediately preceding character one or more times
# *     match immediately preceding character zero or more times
# ?     match immediately preceding character zero or one times
# [ ]   enclose a set of characters, match any one
# [^ ]  enclose a set of characters, match everything except them

# describe_line($line_no, $text)
#
# Tests $text against each of the example patterns below and returns the
# list of report messages (each ending in "\n") for the patterns that
# matched, in the order the patterns are tested.  Returns an empty list when
# nothing matches.  $line_no is only used to label the messages.
sub describe_line {
    my ( $line_no, $text ) = @_;
    my @report;

    if ( $text =~ /^>/ ) {
        push @report, "Line $line_no is a fasta ID line.\n";
    }

    if ( $text =~ /NELL$/ ) {
        push @report, "Line $line_no has a C-terminal NELL motif.\n";
    }

    # simple N-glycosylation: N X S or T (sensitive, but not specific)
    if ( $text =~ /N.[ST]/ ) {
        push @report, "Line $line_no has a simple glycosylation motif.\n";
    }

    # PROSITE N-glycosylation: N, any residue except P, S or T, any except P
    if ( $text =~ /N[^P][ST][^P]/ ) {
        push @report, "Line $line_no has a PROSITE glycosylation motif.\n";
    }

    # PROSITE thioredoxin motif (high sensitivity and specificity)
    # Written with /x so the pattern can be laid out over two lines: under
    # /x, whitespace outside a [ ] class is ignored, so layout can no longer
    # sneak a literal space into the pattern and silently break the match.
    if ( $text =~ / [LIVMF] [LIVMSTA] . [LIVMFYC] [FYWSTHE] .. [FYWGTN]
                    C [GATPLVE] [PHYWSTA] C .{6} [LIVMFYWT] /x ) {
        push @report, "Line $line_no has a PROSITE thioredoxin motif.\n";
    }

    return @report;
}

# regular expressions are found after =~ and between / /
# First pass: report every line that contains the literal CGPC motif.
for my $i ( 0 .. $#protein ) {
    if ( $protein[$i] =~ /CGPC/ ) {
        print "CGPC motif found in line $i.\n";
    }
}

# Second pass: anchors, wildcards and character classes.
for my $i ( 0 .. $#protein ) {
    print describe_line( $i, $protein[$i] );
}

# Recorded session (prompts and program output as originally captured):
#
#   lenti229 [~/perltest]>more regexp.pl
#   ... (listing of this script) ...
#   lenti230 [~/perltest]>perl regexp.pl
#   CGPC motif found in line 1.
#   Line 0 is a fasta ID line.
#   Line 1 has a C-terminal NELL motif.
#   Line 1 has a PROSITE thioredoxin motif.
#   Line 2 is a fasta ID line.
#   Line 3 has a simple glycosylation motif.
#   Line 3 has a PROSITE glycosylation motif.
#   lenti231 [~/perltest]>