path: root/uri.h



#ifndef uri_H
#define uri_H

//#define _XOPEN_SOURCE 500 //for strdup
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

// Character classes used by the percent-encoding helpers, written as string
// literals so that adjacent classes can be concatenated at compile time.
//uri_reserved = gen-delims / sub-delims
#define pe_gen_delims ":/?#[]@"
#define pe_sub_delims "!$&'()*+,;="
//char *pe_reserved[]=pe_gen_delims "" pe_sub_delims; 
// Letters, digits and the four marks "-._~"; per the commented-out line
// below, these three together were meant to form the "unreserved" set.
#define pe_ALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define pe_DIGIT "0123456789"
#define pe_HPUT "-._~"
//char *pe_unreserved[]=pe_ALPHA "" pe_DIGIT "" pe_HPUT;

// NOTE(review): 256-entry byte table that nothing in the visible part of this
// file reads or writes. The name presumably refers to RFC 3986 -- confirm
// before relying on it.
unsigned char rfc3086_percent_encoding[256];

// ASCII-only replacements for the <ctype.h> classifiers of the same name.
// Every use of the argument is parenthesized so that an argument containing a
// low-precedence operator (e.g. c ? x : y) still expands correctly.
// The argument is evaluated more than once: do not pass expressions with side
// effects (i++, function calls that consume input, ...).
// isxdigit(a): non-zero when a is one of 0-9, a-f, A-F.
#define isxdigit(a) (((a) >= 'a' && (a) <= 'f') || ((a) >= '0' && (a) <= '9') || ((a) >= 'A' && (a) <= 'F'))
// toupper(a): a-z is mapped to A-Z by subtracting ' ' (32); anything else is returned unchanged.
#define toupper(a) (((a) >= 'a' && (a) <= 'z')?(a)-' ':(a))

char *uri_reserved={
  pe_gen_delims
  pe_sub_delims
  pe_ALPHA
  pe_DIGIT
  pe_HPUT
};

// Returns non-zero when byte c is copied to the output unchanged, i.e. when it
// appears in uri_reserved.
static int uri_byte_is_literal(unsigned char c) {
  // strchr() also matches the terminating NUL of the string it searches, so a
  // 0 byte has to be rejected explicitly; otherwise it would be emitted raw
  // (truncating the result when used as a C string) instead of as %00.
  return c != 0 && strchr(uri_reserved,c) != NULL;
}

// Returns the number of bytes uriescape() writes for the first len bytes of
// in: 1 for every byte found in uri_reserved, 3 (%XX) for every other byte.
// The count does not include a terminator; add one if you plan on putting a
// null byte at the end.
int uriescapelength(unsigned char *in,int len) {
  int rlen=0;
  int i;
  for(i=0;i<len;i++) {
    rlen+=uri_byte_is_literal(in[i])?1:3;
  }
  return rlen;
}

// Percent-encodes the first len bytes of in into out.
// Bytes found in uri_reserved are copied unchanged; all others (including 0
// bytes) become '%' followed by two upper-case hex digits.
// out must have room for uriescapelength(in,len) bytes. No terminating null
// byte is written; the caller adds it if needed.
void uriescape(unsigned char *in,unsigned char *out,int len) {
  static const char hex[]="0123456789ABCDEF";
  int i;
  int j=0;
  for(i=0;i<len;i++) {
    if(uri_byte_is_literal(in[i])) {
      out[j++]=in[i];
    } else {
      out[j++]='%';
      out[j++]=hex[in[i] >> 4];   // high nibble
      out[j++]=hex[in[i] & 0x0F]; // low nibble
    }
  }
}

// Returns the value (0-15) of hex digit c, or -1 when c is not 0-9, a-f or A-F.
static int uri_hexval(char c) {
  if(c >= '0' && c <= '9') return c-'0';
  if(c >= 'a' && c <= 'f') return c-'a'+10;
  if(c >= 'A' && c <= 'F') return c-'A'+10;
  return -1;
}

// Decodes %XX escapes of the null-terminated string in into out and
// null-terminates out.
// A '%' that is not followed by two hex digits is copied through literally.
// out needs room for at most strlen(in)+1 bytes (decoding never grows the
// string). in and out may be the same buffer: the output never runs ahead of
// the input, and overlapping copies use memmove().
// Returns the length of the decoded string, not counting the terminator.
int uriunescape(char *in,char *out) {
  char *o=out;
  char *s=in;
  char *t;
  size_t tail;
  while((t=strchr(s,'%'))) {
    int hi,lo;
    if(t != s) {//if there are actually bytes to copy.
      memmove(o,s,(size_t)(t-s));
      o+=(t-s);
      s=t;
    }
    // Both digits are read before *o is written, which matters when decoding
    // in place. t[2] is only inspected when t[1] is a hex digit, so the
    // terminator is never read past.
    hi=uri_hexval(t[1]);
    lo=(hi < 0) ? -1 : uri_hexval(t[2]);
    if(lo >= 0) {
      *o++=(char)((hi << 4) | lo);
      s+=3;//skip the %XX
    } else {
      //skip just the %. the next character might be a % //TODO: look up what the "right" thing to do here is.
      *o++='%';
      s++;
    }
  }
  //copy the last part. The tail length is measured exactly once, before the
  //move: when in == out, moving and terminating the tail changes what s
  //points at, so a later strlen(s) would come out too short.
  tail=strlen(s);
  memmove(o,s,tail);
  o[tail]=0;
  return (int)(o+tail-out);
}

// The eight components of a URI, each a char pointer.
// Every component can be reached three ways: by a one-letter name, by a long
// name, or by position through A[] (the array overlays the eight fields in
// declaration order).
// Warning from the original author: it is technically undefined behavior to
// set one half of a union then use the other half.
struct uri {
  union {
    char *A[8];
    struct {
      union { char *s, *scheme; };       /* A[0] */
      union { char *u, *username; };     /* A[1] */
      union { char *k, *password; };     /* A[2] */
      union { char *d, *domain; };       /* A[3] */
      union { char *P, *port; };         /* A[4] */
      union { char *p, *path; };         /* A[5] */
      union { char *q, *query_string; }; /* A[6] */
      union { char *f, *fragment_id; };  /* A[7] */
    };
  };
};

//returns 0 on success, returns a byte with bits set for non-matching pieces.
// Compares two URIs component by component (A[0]..A[7]).
// Returns 0 when every component matches, otherwise a bit mask where, for
// component index i:
//   bit i      both components are set but strcmp() says they differ
//   bit i+8    a has the component, b does not
//   bit i+16   b has the component, a does not
// A component that is NULL on both sides counts as matching.
unsigned int uricmp(struct uri *a,struct uri *b) {
  unsigned int mask=0;
  int idx;
  for(idx=0;idx<8;idx++) {
    char *left=a->A[idx];
    char *right=b->A[idx];
    if(left && right) {
      if(strcmp(left,right) != 0) {
        mask |= 1u<<idx;
      }
    } else if(left) {
      mask |= 1u<<(idx+8);//we have a's but not b's
    } else if(right) {
      mask |= 1u<<(idx+16);//we have b's but not a's
    }
  }
  return mask;
}

// Builds the textual form of *u in a freshly malloc()ed buffer:
//   scheme://username:password@domain:port/path?query_string#fragment_id
// or, without a domain:
//   scheme:path?query_string#fragment_id
// Rules, in output order:
//   - username, password and port are only written when there is a domain;
//     password additionally needs a username.
//   - with a domain, the path gets a leading '/' if it does not have one.
//   - NOTE(review): the path is only written when a scheme is set, so a
//     scheme-less uri loses its path here -- confirm that is intended.
// The buffer is sized from the components (never smaller than 2048 bytes, the
// size earlier versions always allocated), so long components cannot overflow it.
// Returns NULL when malloc() fails. The caller owns the result and free()s it.
char *linefromuri(struct uri *u) {
  const char *parts[16];//at most 15 pieces are ever queued below
  size_t lens[16];
  size_t n=0;
  size_t total=0;
  size_t cap;
  size_t i;
  char *line;
  char *w;

  if(u->scheme) {
    parts[n++]=u->scheme;
    parts[n++]=u->domain ? "://" : ":";
  }
  if(u->domain) {
    if(u->username) {
      parts[n++]=u->username;
      if(u->password) {//we /should/ only have a password if there's a username AND domain
        parts[n++]=":";
        parts[n++]=u->password;
      }
      parts[n++]="@";
    }
    parts[n++]=u->domain;
    if(u->port) {//port only makes sense if there's a domain
      parts[n++]=":";
      parts[n++]=u->port;
    }
  }
  if(u->path && u->scheme) {
    if(u->domain && *u->path != '/') {
      parts[n++]="/";//path must start with / if we have domain.
    }
    parts[n++]=u->path;
  }
  if(u->query_string) {
    parts[n++]="?";
    parts[n++]=u->query_string;
  }
  if(u->fragment_id) {
    parts[n++]="#";
    parts[n++]=u->fragment_id;
  }

  for(i=0;i<n;i++) {
    lens[i]=strlen(parts[i]);
    total+=lens[i];
  }
  cap=(total+1 < 2048) ? 2048 : total+1;
  line=malloc(cap);
  if(!line) {
    return NULL;
  }
  w=line;
  for(i=0;i<n;i++) {
    memcpy(w,parts[i],lens[i]);
    w+=lens[i];
  }
  *w=0;
  return line;
}

/*
 schemes are case-insensitive, but the canonical form is lowercase.
 domain is case insensitive. return it lowercased?
 port is optional and in decimal
 path
 scheme://username:password@domain:port/path?query_string#fragment_id
 mailto:username@domain

 optional stuff:
 scheme, username, password, port, path, query_string, fragment_id
*/

//should it be a dick about what characters are allowed?
//should it just try to ignore weird shit?

//return 0 on fail //not sure what this means.
//return 1 on success
int urifromline(struct uri *u,char *line) {
  //these first two are easy. the rest... not so much.
  char *t;
//  memset(u,0,sizeof(struct uri)); //this function shouldn't do this.
  if((u->fragment_id=strchr(line,'#'))) {
    *u->fragment_id=0;
    u->fragment_id++;
  }
  if((u->query_string=strchr(line,'?'))) {
    *u->query_string=0;
    u->query_string++;
  }
  //now we have scheme, user, pass, domain, port, and path. maybe.
  //what character can we split on now? : is a terrible choice.
  // how about /? first / is either a separator between scheme
  //could find the first non-scheme character.
  //so we might have... scheme://user:pass@host:port/path
  //or... user:pass@host:port/path ?
  //we need to do this based on /s
  // we're either going to find the scheme and authority separator
  // or we're going to find the start of a path.
  //there: scheme:/path, scheme://host (empty path), or scheme:path/morepath
  //or...  should we do paths without
  //scheme must start with a-z
/*  if(*line == '/' && *(line+1) != '/') { //we have a relative path. /like:this.maybe
    u->path=line;
    return;//we're done. nothing else to do.
  }
  if(*line == '.') { //we have a relative path like: ./derp or ../merp
    u->path=line;
    return;//we're done here. nothing else to do.
  }*/
  //let's see if this starts with a scheme
  if(strchr(line,':') && ((*line >= 'a' && *line <= 'z') || (*line >= 'A' && *line <= 'Z'))) {
    for(u->scheme=strchr(line,':')-1;u->scheme > line;u->scheme--) {
      if((*u->scheme >= 'a' && *u->scheme <= 'z') ||
         (*u->scheme >= 'A' && *u->scheme <= 'Z') ||
         (*u->scheme >= '0' && *u->scheme <= '9') ||
         *u->scheme == '+' || *u->scheme == '-' || *u->scheme == '.') {
        //this is still a scheme.
      } else {
        break;
      }
    }
    if(u->scheme == line) {//we got through the for loop alright. line starts with a scheme.
      line=strchr(line,':');
      *line=0;
      line++;
      for(t=u->scheme;*t;t++) {
        if(*t >= 'A' && *t <= 'Z') *t+=' ';
      }
    }
  }

  //copy-pasted from above the scheme strip attempt.
  if(*line == '/' && *(line+1) != '/') { //we have a relative path. /like:this.maybe
    u->path=line;
    return 1;//we're done. nothing else to do.
  }
  if(*line == '.') { //we have a relative path like: ./derp or ../merp
    u->path=line;
    return 1;//we're done here. nothing else to do.
  }

  if(*line == '/' && line[1] == '/') {//we have an authority section.
    //let's left-shift this shit over until the third /
    for(t=line+1;*(t+1) && *(t+1) != '/';t++) {
      *t=*(t+1);
    }
    *t=0;
    u->path=t+1;//if there was a /, path points at it and the stuff after.
    //if there wasn't a /, it points at a null byte. so "empty"
    u->username=line+1;
  } else {
    //we're an authority section without a // I guess.
    //or we're a path
    if(u->scheme) u->path=line;
    else u->username=line;//if we have a scheme we're not a //-less authority
  }

  if(u->username) {//this contains all of the authority.
    if((u->domain=strchr(u->username,'@'))) {//we have user@host at least.
      *u->domain=0;
      u->domain++;
    } else {//this isn't really a username. it is the domain.
      u->domain=u->username;
      u->username=0;
    }
  }
  //if we still have u->username we try to split to user and password
  if(u->username) {
    if((u->password=strchr(u->username,':'))) {
      *u->password=0;
      u->password++;
    }
  }
  if(u->domain) {
    if((u->port=strchr(u->domain,']')) && *u->domain == '[') {//this is an IPv6 host
      *u->port=0;
      u->port++;
      u->domain++;//we need to skip the leading [
      if(*u->port == ':') {
        *u->port=0;
        u->port++;//if it ends up being empty, whatever. that's a URI like: http://host:/path
      }
    } else { //we're safe to split port off at :
      if((u->port=strchr(u->domain,':'))) {
        *u->port=0;
        u->port++;
      } //there isn't a port. leave it unset.
    }
  }
  //I dunno.<