Add em and en dashes to the start set. Fixes #11490
2 * The AutoHyperlinks Framework is the legal property of its developers (DEVELOPERS), whose names are listed in the
3 * copyright file included with this source distribution.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of the AutoHyperlinks Framework nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY ITS DEVELOPERS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ITS DEVELOPERS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #import "AHHyperlinkScanner.h"
29 #import "AHLinkLexer.h"
30 #import "AHMarkedHyperlink.h"
32 #define DEFAULT_URL_SCHEME @"http://"
33 #define ENC_INDEX_KEY @"encIndex"
34 #define ENC_CHAR_KEY @"encChar"
36 @interface AHHyperlinkScanner (PRIVATE)
37 - (NSRange)_longestBalancedEnclosureInRange:(NSRange)inRange;
38 - (BOOL)_scanString:(NSString *)inString upToCharactersFromSet:(NSCharacterSet *)inCharSet intoRange:(NSRange *)outRangeRef fromIndex:(unsigned long *)idx;
39 - (BOOL)_scanString:(NSString *)inString charactersFromSet:(NSCharacterSet *)inCharSet intoRange:(NSRange *)outRangeRef fromIndex:(unsigned long *)idx;
42 @implementation AHHyperlinkScanner
43 #pragma mark static variables
44 static NSCharacterSet *skipSet = nil;
45 static NSCharacterSet *endSet = nil;
46 static NSCharacterSet *startSet = nil;
47 static NSCharacterSet *puncSet = nil;
48 static NSCharacterSet *hostnameComponentSeparatorSet = nil;
49 static NSArray *enclosureStartArray = nil;
50 static NSCharacterSet *enclosureSet = nil;
51 static NSArray *enclosureStopArray = nil;
52 static NSArray *encKeys = nil;
54 #pragma mark Class Methods
// Convenience constructor: returns an autoreleased scanner over a plain string
// with strict checking turned off.
//
// @param inString The text to scan for links.
// @return An autoreleased scanner created via -initWithString:usingStrictChecking:.
+ (id)hyperlinkScannerWithString:(NSString *)inString
{
	id newScanner = [[[self class] alloc] initWithString:inString usingStrictChecking:NO];
	return [newScanner autorelease];
}
// Convenience constructor: returns an autoreleased scanner over a plain string
// with strict checking turned on.
//
// @param inString The text to scan for links.
// @return An autoreleased scanner created via -initWithString:usingStrictChecking:.
+ (id)strictHyperlinkScannerWithString:(NSString *)inString
{
	id newScanner = [[[self class] alloc] initWithString:inString usingStrictChecking:YES];
	return [newScanner autorelease];
}
// Convenience constructor: returns an autoreleased scanner over an attributed
// string with strict checking turned off.
//
// @param inString The attributed text to scan for links.
// @return An autoreleased scanner created via -initWithAttributedString:usingStrictChecking:.
+ (id)hyperlinkScannerWithAttributedString:(NSAttributedString *)inString
{
	id newScanner = [[[self class] alloc] initWithAttributedString:inString usingStrictChecking:NO];
	return [newScanner autorelease];
}
// Convenience constructor: returns an autoreleased scanner over an attributed
// string with strict checking turned on.
//
// The strict factory must pass YES for usingStrictChecking (as
// +strictHyperlinkScannerWithString: does); passing NO made this method
// behave exactly like the non-strict attributed-string factory.
//
// @param inString The attributed text to scan for links.
// @return An autoreleased scanner created via -initWithAttributedString:usingStrictChecking:.
+ (id)strictHyperlinkScannerWithAttributedString:(NSAttributedString *)inString
{
	return [[[[self class] alloc] initWithAttributedString:inString usingStrictChecking:YES] autorelease];
}
75 #pragma mark Initialization
78 if ((self == [AHHyperlinkScanner class])) {
80 NSMutableCharacterSet *mutableSkipSet = [[NSMutableCharacterSet alloc] init];
81 [mutableSkipSet formUnionWithCharacterSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
82 [mutableSkipSet formUnionWithCharacterSet:[NSCharacterSet illegalCharacterSet]];
83 [mutableSkipSet formUnionWithCharacterSet:[NSCharacterSet controlCharacterSet]];
84 [mutableSkipSet formUnionWithCharacterSet:[NSCharacterSet characterSetWithCharactersInString:@"<>"]];
85 skipSet = [[NSCharacterSet characterSetWithBitmapRepresentation:[mutableSkipSet bitmapRepresentation]] retain];
86 [mutableSkipSet release];
90 endSet = [[NSCharacterSet characterSetWithCharactersInString:@"\"',:;>)]}.?!@"] retain];
94 NSMutableCharacterSet *mutableStartSet = [[NSMutableCharacterSet alloc] init];
95 [mutableStartSet formUnionWithCharacterSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
96 [mutableStartSet formUnionWithCharacterSet:[NSCharacterSet characterSetWithCharactersInString:[NSString stringWithFormat:@"\"'.,:;<?!-@%C%C", 0x2014, 0x2013]]];
97 startSet = [[NSCharacterSet characterSetWithBitmapRepresentation:[mutableStartSet bitmapRepresentation]] retain];
98 [mutableStartSet release];
102 puncSet = [[NSCharacterSet characterSetWithCharactersInString:@"\"'.,:;<?!"] retain];
105 if (!hostnameComponentSeparatorSet) {
106 hostnameComponentSeparatorSet = [[NSCharacterSet characterSetWithCharactersInString:@"./"] retain];
109 if(!enclosureStartArray){
110 enclosureStartArray = [[NSArray arrayWithObjects:@"(",@"[",@"{",nil] retain];
114 enclosureSet = [[NSCharacterSet characterSetWithCharactersInString:@"()[]{}"] retain];
117 if(!enclosureStopArray){
118 enclosureStopArray = [[NSArray arrayWithObjects:@")",@"]",@"}",nil] retain];
122 encKeys = [[NSArray arrayWithObjects:ENC_INDEX_KEY, ENC_CHAR_KEY, nil] retain];
127 #pragma mark Init/Dealloc
// Initializes a scanner over a plain string. `flag` is stored as the
// strict-checking mode used by later validation calls.
// NOTE(review): the objects/keys passed to the m_urlSchemes dictionary are not
// visible in this view.
130 - (id)initWithString:(NSString *)inString usingStrictChecking:(BOOL)flag
132 if((self = [self init])){
// Take ownership of the input; a plain-string scanner has no attributed string.
133 m_scanString = [inString retain];
134 m_scanAttrString = nil;
135 m_urlSchemes = [[NSDictionary alloc] initWithObjectsAndKeys:
138 m_strictChecking = flag;
140 m_scanStringLength = [m_scanString length];
// Initializes a scanner over an attributed string. Both the attributed string
// and its plain -string are retained; `flag` is stored as the strict-checking mode.
// NOTE(review): the objects/keys passed to the m_urlSchemes dictionary are not
// visible in this view.
145 - (id)initWithAttributedString:(NSAttributedString *)inString usingStrictChecking:(BOOL)flag
147 if((self = [self init])){
// Scanning runs on the plain text; the attributed original is kept alongside it.
148 m_scanString = [[inString string] retain];
149 m_scanAttrString = [inString retain];
150 m_urlSchemes = [[NSDictionary alloc] initWithObjectsAndKeys:
153 m_strictChecking = flag;
155 m_scanStringLength = [m_scanString length];
162 [m_scanString release];
163 [m_urlSchemes release];
164 if(m_scanAttrString) [m_scanAttrString release];
168 #pragma mark URI Verification
172 return [AHHyperlinkScanner isStringValidURI:m_scanString usingStrict:m_strictChecking fromIndex:nil withStatus:nil];
// Runs the flex link lexer over inString and classifies the result.
//
// inString          - candidate text; it is converted to a C string before lexing.
// useStrictChecking - when YES, the "degenerate" statuses are not accepted.
// index             - optional; when non-NULL it is advanced by the lexer's match length.
// validStatus       - receives the status returned by the lexer.
//
// NOTE(review): the return statements and some guard lines are not visible in
// this view; the comments below describe only the visible statements.
175 + (BOOL)isStringValidURI:(NSString *)inString usingStrict:(BOOL)useStrictChecking fromIndex:(unsigned long *)index withStatus:(AH_URI_VERIFICATION_STATUS *)validStatus
177 AH_BUFFER_STATE buf; // buffer for flex to scan from
178 yyscan_t scanner; // pointer to the flex scanner opaque type
179 const char *inStringEnc;
180 unsigned long encodedLength;
// Local fallback storage for the status.
// NOTE(review): presumably guarded by a nil check on validStatus (guard not
// visible here); if so, newStatus must stay in scope for as long as validStatus
// points at it — confirm.
183 AH_URI_VERIFICATION_STATUS newStatus = AH_URL_INVALID;
184 validStatus = &newStatus;
187 *validStatus = AH_URL_INVALID; // assume the URL is invalid
189 // Find the fastest 8-bit wide encoding possible for the c string
190 NSStringEncoding stringEnc = [inString fastestEncoding];
// If a single space needs more than one byte in that encoding, it is not
// 8-bit wide, so fall back to UTF-8.
191 if([@" " lengthOfBytesUsingEncoding:stringEnc] > 1U)
192 stringEnc = NSUTF8StringEncoding;
// Conversion can fail (nil C string); the handling inside this branch is not visible here.
194 if (!(inStringEnc = [inString cStringUsingEncoding:stringEnc])) {
199 encodedLength = strlen(inStringEnc); // byte length of the encoded C string (stringEnc, not necessarily UTF-8)
201 // initialize the buffer (flex automatically switches to the buffer in this function)
202 AHlex_init(&scanner);
203 buf = AH_scan_string(inStringEnc, scanner);
205 // call flex to parse the input
206 *validStatus = AHlex(scanner);
// NOTE(review): the match length is counted in encoded bytes — confirm that
// callers' indices use the same unit for non-ASCII input.
207 if(index) *index += AHget_leng(scanner);
209 // condition for valid URIs
210 if(*validStatus == AH_URL_VALID || *validStatus == AH_MAILTO_VALID || *validStatus == AH_FILE_VALID){
211 AH_delete_buffer(buf, scanner); //remove the buffer from flex.
212 buf = NULL; //null the buffer pointer for safety's sake.
214 // check that the whole string was matched by flex.
215 // this prevents silly things like "blah...com" from being seen as links
216 if(AHget_leng(scanner) == encodedLength){
217 AHlex_destroy(scanner);
220 // condition for degenerate URLs (a.k.a. URIs sans specifiers); requires strict checking to be NO.
221 }else if((*validStatus == AH_URL_DEGENERATE || *validStatus == AH_MAILTO_DEGENERATE) && !useStrictChecking){
222 AH_delete_buffer(buf, scanner);
// Same whole-string requirement as the valid case above.
224 if(AHget_leng(scanner) == encodedLength){
225 AHlex_destroy(scanner);
228 // if it isn't valid, and it isn't degenerate, then it's invalid.
230 AH_delete_buffer(buf, scanner);
232 AHlex_destroy(scanner);
235 // default case, if the range checking above fails.
236 AHlex_destroy(scanner);
240 #pragma mark Accessors
// Scans forward from m_scanLocation for the next link candidate and returns it
// as an AHMarkedHyperlink. The final statement sets m_scanLocation to the string
// length once the scanning loop is exhausted.
// NOTE(review): several lines of this method (the switch header, some else
// branches and return statements) are not visible in this view.
242 - (AHMarkedHyperlink *)nextURI
244 NSRange scannedRange;
245 unsigned long scannedLocation = m_scanLocation;
247 // scan up to the next whitespace char so that we don't unnecessarily confuse flex
248 // otherwise we end up validating urls that look like this "http://www.adiumx.com/ <--cool"
// Consumes any leading run of startSet characters; the range itself is discarded (intoRange:nil).
249 [self _scanString:m_scanString charactersFromSet:startSet intoRange:nil fromIndex:&scannedLocation];
251 // main scanning loop
// Each pass takes the next run of characters up to a member of skipSet.
252 while([self _scanString:m_scanString upToCharactersFromSet:skipSet intoRange:&scannedRange fromIndex:&scannedLocation]) {
253 BOOL foundUnpairedEnclosureCharacter = NO;
255 // Check for and filter enclosures. We can't add (, [, etc. to the skipSet as they may be in a URI
256 if([enclosureSet characterIsMember:[m_scanString characterAtIndex:scannedRange.location]]){
// Index of the leading character among the opening enclosures; NSNotFound if it is a closing one.
257 unsigned long encIdx = [enclosureStartArray indexOfObject:[m_scanString substringWithRange:NSMakeRange(scannedRange.location, 1)]];
259 if(NSNotFound != encIdx) {
// Search backwards within the token for the matching closing character.
260 encRange = [m_scanString rangeOfString:[enclosureStopArray objectAtIndex:encIdx] options:NSBackwardsSearch range:scannedRange];
261 if(NSNotFound != encRange.location){
// Drops the first character and shortens the range by two.
// NOTE(review): this assumes the closing character is the last character of the range — confirm.
262 scannedRange.location++; scannedRange.length -= 2;
264 foundUnpairedEnclosureCharacter = YES;
// Nothing left to examine after trimming.
268 if(!scannedRange.length) break;
270 // Find balanced enclosure chars
271 NSRange longestEnclosure = [self _longestBalancedEnclosureInRange:scannedRange];
// Trim trailing endSet characters while more than two characters remain.
272 while (scannedRange.length > 2 && [endSet characterIsMember:[m_scanString characterAtIndex:(scannedRange.location + scannedRange.length - 1)]]) {
// NOTE(review): compares an absolute end position (location + length) with a length — confirm intent.
273 if((longestEnclosure.location + longestEnclosure.length) < scannedRange.length){
274 scannedRange.length--;
275 foundUnpairedEnclosureCharacter = NO;
279 // if we have a valid URL then save the scanned string, and make an AHMarkedHyperlink out of it.
280 // this way, we can preserve things like the matched string (to be converted to a NSURL),
281 // parent string, its validation status (valid, file, degenerate, etc), and its range in the parent string
282 AH_URI_VERIFICATION_STATUS validStatus;
283 NSString *_scanString = nil;
// Only candidates longer than three characters are extracted and validated.
284 if(3 < scannedRange.length) _scanString = [m_scanString substringWithRange:scannedRange];
286 if((3 < scannedRange.length) && [[self class] isStringValidURI:_scanString usingStrict:m_strictChecking fromIndex:&m_scanLocation withStatus:&validStatus]){
287 AHMarkedHyperlink *markedLink;
289 //insert typical specifiers if the URL is degenerate
291 case AH_URL_DEGENERATE:
// Start from the default scheme; it may be replaced below based on the first hostname component.
293 NSString *scheme = DEFAULT_URL_SCHEME;
296 NSRange firstComponent;
297 [self _scanString:_scanString
298 upToCharactersFromSet:hostnameComponentSeparatorSet
299 intoRange:&firstComponent
302 if(NSNotFound != firstComponent.location) {
// Look the first component up in m_urlSchemes and use the mapped scheme when one exists.
303 NSString *hostnameScheme = [m_urlSchemes objectForKey:[_scanString substringWithRange:firstComponent]];
304 if(hostnameScheme) scheme = hostnameScheme;
307 _scanString = [scheme stringByAppendingString:_scanString];
312 case AH_MAILTO_DEGENERATE:
313 _scanString = [@"mailto:" stringByAppendingString:_scanString];
// Wrap the (possibly scheme-prefixed) string together with its status, parent string and range.
320 markedLink = [[[AHMarkedHyperlink alloc] initWithString:_scanString
321 withValidationStatus:validStatus
322 parentString:m_scanString
323 andRange:scannedRange] autorelease];
// A link that cannot produce a URL is reported as nil.
324 return [markedLink URL]? markedLink : nil;
327 //step location after scanning a string
328 if (foundUnpairedEnclosureCharacter){
// Resume just past the first puncSet character inside the token, if there is one.
331 NSRange startRange = [m_scanString rangeOfCharacterFromSet:puncSet options:NSLiteralSearch range:scannedRange];
332 if (startRange.location != NSNotFound)
333 m_scanLocation = startRange.location + startRange.length;
335 m_scanLocation += scannedRange.length;
338 scannedLocation = m_scanLocation;
341 // if we're here, then the scanning loop hit the end of the string
342 // set m_scanLocation to the string length here so we avoid potential infinite looping with many trailing spaces.
343 m_scanLocation = m_scanStringLength;
349 NSMutableArray *rangeArray = [NSMutableArray array];
350 AHMarkedHyperlink *markedLink;
351 unsigned long _holdOffset = m_scanLocation; // store location for later restoration;
352 m_scanLocation = 0; //set the offset to 0.
354 //build an array of marked links.
355 while((markedLink = [self nextURI])){
356 [rangeArray addObject:markedLink];
358 m_scanLocation = _holdOffset; // reset scanLocation
// Builds an attributed copy of the scanned text with NSLinkAttributeName applied
// over the range of every link returned by -nextURI. The scan location is saved
// on entry and restored before returning.
// NOTE(review): the statement that sets _didFindLinks and the attribute value
// argument are not visible in this view.
362 -(NSAttributedString *)linkifiedString
364 NSMutableAttributedString *linkifiedString;
365 AHMarkedHyperlink *markedLink;
366 BOOL _didFindLinks = NO;
367 unsigned long _holdOffset = m_scanLocation; // store location for later restoration;
// Work on a mutable copy of the attributed string when there is one, otherwise
// on a new attributed string built from the plain text.
371 if(m_scanAttrString) {
372 linkifiedString = [[m_scanAttrString mutableCopy] autorelease];
374 linkifiedString = [[[NSMutableAttributedString alloc] initWithString:m_scanString] autorelease];
377 //for each AHMarkedHyperlink, add the proper URL to the proper range in the string.
378 while((markedLink = [self nextURI])) {
379 NSURL *markedLinkURL;
381 if((markedLinkURL = [markedLink URL])){
382 [linkifiedString addAttribute:NSLinkAttributeName
384 range:[markedLink range]];
388 m_scanLocation = _holdOffset; // reset scanLocation
// When no links were found, hand back the original attributed string (or a new
// attributed string made from the plain text) instead of the working copy.
390 return _didFindLinks? linkifiedString :
391 m_scanAttrString ? [[m_scanAttrString retain] autorelease] : [[[NSMutableAttributedString alloc] initWithString:m_scanString] autorelease];
// Returns the offset into the scan string from which -nextURI resumes scanning.
-(unsigned long)scanLocation
{
	return self->m_scanLocation;
}
// Sets the offset into the scan string from which -nextURI resumes scanning.
// NOTE(review): the parameter is unsigned int while -scanLocation returns
// unsigned long; the signature is kept as-is to match the declared interface.
- (void)setScanLocation:(unsigned int)location
{
	self->m_scanLocation = location;
}
404 #pragma mark Below Here There Be Private Methods
// Walks inRange of m_scanString looking for enclosure characters, pairing each
// closing character with a previously seen opening one. Returns the range of the
// most recently recorded pair (the last object of enclosureArray), or {0, 0}
// when no pair was recorded.
// NOTE(review): some lines (the forKeys: argument, loop exits, the index
// increment) are not visible in this view.
406 - (NSRange)_longestBalancedEnclosureInRange:(NSRange)inRange
408 NSMutableArray *enclosureStack = nil, *enclosureArray = nil;
409 NSString *matchChar = nil;
410 NSDictionary *encDict;
411 unsigned long encScanLocation = inRange.location;
413 while(encScanLocation < inRange.length + inRange.location) {
// Advance encScanLocation to the next enclosure character (range output discarded).
414 [self _scanString:m_scanString upToCharactersFromSet:enclosureSet intoRange:nil fromIndex:&encScanLocation];
// Stop once the scan has moved past the end of inRange.
416 if(encScanLocation >= (inRange.location + inRange.length)) break;
418 matchChar = [m_scanString substringWithRange:NSMakeRange(encScanLocation, 1)];
// Opening character: remember its index and character on the stack.
420 if([enclosureStartArray containsObject:matchChar]) {
421 encDict = [NSDictionary dictionaryWithObjects:[NSArray arrayWithObjects:[NSNumber numberWithUnsignedLong:encScanLocation], matchChar, nil]
423 if(!enclosureStack) enclosureStack = [NSMutableArray arrayWithCapacity:1];
424 [enclosureStack addObject:encDict];
// Closing character: look through the stack for an opening entry of the same kind.
425 }else if([enclosureStopArray containsObject:matchChar]) {
426 NSEnumerator *encEnumerator = [enclosureStack objectEnumerator];
427 while ((encDict = [encEnumerator nextObject])) {
428 unsigned long encTagIndex = [(NSNumber *)[encDict objectForKey:ENC_INDEX_KEY] unsignedLongValue];
// NOTE(review): indexOfObjectIdenticalTo: compares pointers; the characters here
// come from substringWithRange:, so both lookups may return NSNotFound and then
// compare equal regardless of bracket kind — confirm.
429 unsigned long encStartIndex = [enclosureStartArray indexOfObjectIdenticalTo:[encDict objectForKey:ENC_CHAR_KEY]];
430 if([enclosureStopArray indexOfObjectIdenticalTo:matchChar] == encStartIndex) {
// The recorded range spans from the opening character through the closing one, inclusive.
431 NSRange encRange = NSMakeRange(encTagIndex, encScanLocation - encTagIndex + 1);
432 if(!enclosureStack) enclosureStack = [NSMutableArray arrayWithCapacity:1];
433 if(!enclosureArray) enclosureArray = [NSMutableArray arrayWithCapacity:1];
434 [enclosureStack removeObject:encDict];
// Ranges are stored as strings and parsed back with NSRangeFromString on return.
435 [enclosureArray addObject:NSStringFromRange(encRange)];
440 if(encScanLocation < inRange.length + inRange.location)
443 return (enclosureArray && [enclosureArray count])? NSRangeFromString([enclosureArray lastObject]) : NSMakeRange(0, 0);
446 // functional replacement for -[NSScanner scanUpToCharactersFromSet:intoString:]
// Starting at *idx, skips members of the static skipSet, then collects characters
// until one belongs to inCharSet or skipSet. *idx is set to where the collected
// run starts. When the run is non-empty, it is written to outRangeRef if that is non-NULL.
// NOTE(review): local declarations and the return statements are not visible in this view.
447 - (BOOL)_scanString:(NSString *)inString upToCharactersFromSet:(NSCharacterSet *)inCharSet intoRange:(NSRange *)outRangeRef fromIndex:(unsigned long *)idx
451 unsigned long _scanLength = [inString length];
// Nothing to scan when the start index is at or past the end of the string.
454 if(_scanLength <= *idx) return NO;
// Phase 1: step over leading skipSet characters.
457 for(_idx = *idx; _scanLength > _idx; _idx++) {
458 _curChar = [inString characterAtIndex:_idx];
459 if(![skipSet characterIsMember:_curChar]) break;
// Phase 2: record the run start in *idx, then advance to the first stop character.
463 for(*idx = _idx; _scanLength > _idx; _idx++) {
464 _curChar = [inString characterAtIndex:_idx];
465 if([inCharSet characterIsMember:_curChar] || [skipSet characterIsMember:_curChar]) break;
468 _outRange = NSMakeRange(*idx, _idx - *idx);
471 if(_outRange.length) {
472 if(outRangeRef) *outRangeRef = _outRange;
479 // functional replacement for -[NSScanner scanCharactersFromSet:intoString:]
480 - (BOOL)_scanString:(NSString *)inString charactersFromSet:(NSCharacterSet *)inCharSet intoRange:(NSRange *)outRangeRef fromIndex:(unsigned long *)idx
484 unsigned long _scanLength = [inString length];
485 unsigned long _idx = *idx;
487 if(_scanLength <= _idx) return NO;
490 for(_idx = *idx; _scanLength > _idx; _idx++) {
491 _curChar = [inString characterAtIndex:_idx];
492 if(![skipSet characterIsMember:_curChar]) break;
496 for(*idx = _idx; _scanLength > _idx; _idx++) {
497 _curChar = [inString characterAtIndex:_idx];
498 if(![inCharSet characterIsMember:_curChar]) break;
501 _outRange = NSMakeRange(*idx, _idx - *idx);
504 if(_outRange.length) {
505 if(outRangeRef) *outRangeRef = _outRange;