// skein hash function, 512 bit variant
// pesco, 2009

#include <stdint.h>
#include <string.h>  // memcpy, memset
#include <stdio.h>

#ifdef BIGENDIAN
void ltoh64(uint64_t *p, int n);
void htol64(uint64_t *p, int n);
# error "TODO: implement htol64 and ltoh64"
#else
# define htol64(p,n) // no-op
# define ltoh64(p,n) // no-op
#endif

static const int Nw = 8;     // block length in words
static const int Nb = 64;    // block length in bytes


// ==- threefish -==

uint64_t inline rotl(uint64_t x, int n)
{
	return (x<<n | x>>(64-n));
}

void inline mix(int n, const int *r, int d, int j, const uint64_t *x, uint64_t *y)
{
	y[0] = x[0] + x[1];
	y[1] = y[0] ^ rotl(x[1], r[(d%8)*(n/2) + j]);
}

void inline
add_subkey(int n, int d, const uint64_t *k, const uint64_t *t, uint64_t *v)
{
	int s = d/4;
	int i;

	for(i=0; i<n; i++)
		v[i] += k[(s+i) % (n+1)];
	v[5] += t[s%3];
	v[6] += t[(s+1)%3];
	v[7] += s;
}

// assumes space for 1 extra word after k and t for extensions
// Nw = block size in words
// Nr = number of rounds
// r  = rotation table, dimensions: 8*(Nw/2)
// pi = permutation, length: Nw
void threefish(int Nw, int Nr, const int *r, const int *pi,
               uint64_t *k, uint64_t *t, const uint64_t *p, uint64_t *v)
{
	uint64_t f[Nw];    // intermediate state after mix step (before permutation)
	int d;             // round counter
	int i;

	// extend key and tweak with parity words
	k[Nw] = 0x5555555555555555LL;  // floor(2^64 / 3)
	for(i=0; i<Nw; i++)
		k[Nw] ^= k[i];
        t[2] = t[0] ^ t[1];

	// start with state = plaintext
	for(i=0; i<Nw; i++)
		v[i] = p[i];

	for(d=0; d<Nr; d++) {
		// every 4 rounds, add current subkey
		if(d%4 == 0)
			add_subkey(Nw, d, k, t, v);

		// apply mix functions
		for(i=0; i<Nw; i+=2)
			mix(Nw, r, d, i/2, v+i, f+i);

		// permute
		for(i=0; i<Nw; i++)
			v[i] = f[pi[i]];
	}

	// add one last subkey
	add_subkey(Nw, d, k, t, v);
}

// assumes space for 1 extra word after k and t for extensions
void threefish512(uint64_t *k, uint64_t *t, const uint64_t *p, uint64_t *v)
{
	// rotation table for the mix functions
	static const int r[8*4] =
		{46,36,19,37, 33,27,14,42, 17,49,36,39, 44, 9,54,56,
		 39,30,34,24, 13,50,10,17, 25,29,39,43,  8,35,56,22};

	// permutation
	static const int pi[] = {2,1,4,7,6,5,0,3};

	threefish(8, 72, r, pi, k, t, p, v);
}


// ==- ubi -==

#define UBI_TYPE_KEY 0   // key (for MAC and KDF)
#define UBI_TYPE_CFG 4   // configuration block
#define UBI_TYPE_PRS 8   // personalization
#define UBI_TYPE_PK  12  // public key (for signature hashing)
#define UBI_TYPE_KDF 16  // key identifier (for KDF)
#define UBI_TYPE_NON 20  // nonce (for stream cipher or randomized hashing)
#define UBI_TYPE_MSG 48  // message
#define UBI_TYPE_OUT 63  // output

// "raw" ubi worker routine (see below)
void ubi_(uint64_t *g, uint64_t *q, uint64_t* t,
	uint64_t len, uint64_t final, const uint64_t *p);

// ubi for in-memory messages
// assumes space for 1 extra word after g
// only supports 64-bit message lengths
void ubi(uint64_t *g, uint64_t *g_, uint64_t type, uint64_t len, uint8_t *m)
{
	uint64_t r = len%Nb;
	uint64_t *p = (uint64_t *)m;
	uint64_t q[Nw];
	uint64_t t[3];

	// initialize
	t[0] = 0LL;           // position field
	t[1] = type << 56;    // type field at bit 120
	t[1] |= 1LL << 62;    // set "First" flag

	// process block-sized input without padding
	len -= r;
	ltoh64(p, len/8);
	ubi_(g, g_, t, len, !r, p);
	htol64(p, len/8);

	// pad and process final block, if necessary
	if(r) {
		memcpy(q, m+len, r);
		memset(((void *)q)+r, 0, Nb-r);
		ltoh64(q, Nw);
		ubi_(g, g_, t, r, 1, q);
	}
}

// assumes space for 1 extra word after 'g'
// assumes space for 1 extra word after 't'
// only supports 64-bit message lengths
// assumes padding and byteorder conversion already done on 'p'
// 'len' is unpadded input length in bytes
// 'final' flag (0/1) specifies wether 'p' ends with the final message block
void ubi_(uint64_t *g, uint64_t *g_, uint64_t* t,
	uint64_t len, uint64_t final, const uint64_t *p)
{
	static const uint64_t notfirst = ~(1LL<<62);

	uint64_t c[Nw];              // threefish output
	int nin;                     // number of input bytes for current block
	int i;

	nin = Nb;
	while(len) {
		// on last block
		if(len<=Nb) {
			nin = len;
			t[1] |= final<<63;  // set "Final" flag if applicable
		}

		// process block
		t[0] += nin;                // update position counter in tweak
		threefish512(g, t, p, c);
		for(i=0; i<Nw; i++)
			g_[i] = p[i] ^ c[i];

		t[1] &= notfirst;           // clear "First" flag
		len -= nin;
		p++;
	}
}


// ==- skein -==

// assumes that output buffer size is a multiple of the block size
void output(uint64_t *g, uint64_t No, uint8_t *o)
{
	uint64_t i;
	uint64_t h[Nw];

	// pump full blocks of output directly into the buffer
	for(i=0; No>8*Nb; i++, No-=8*Nb) {
		htol64(&i,1);
		ubi(g, ((uint64_t *)o)+i, UBI_TYPE_OUT, 8, (uint8_t *)&i);
		ltoh64(&i,1);
	}
	htol64((uint64_t *)o, i);
	o += Nb*i;
	if(No==0)
		return;

	// generate one more block of output
	htol64(&i,1);
	ubi(g, h, UBI_TYPE_OUT, 8, (uint8_t *)&i);
	ltoh64(&i,1);

	// full bytes of output
	htol64(h, Nw);
	memcpy(o, h, No%8 ? No/8+1 : No/8);
	o += No/8;
	No = No%8;
	if(No==0)
		return;

	// shift last bits of output to the big end
	*o <<= 8-No;
}

void skein(uint64_t Nm, uint8_t *m, uint64_t No, uint8_t *o)
{
	uint64_t g[Nw+1];
	uint8_t cfg[32];   // configuration string

	memset(g, 0, Nb);

	// ubi CFG
	memset(cfg, 0, 32);
	memcpy(cfg, "SHA3", 4);  // schema identifier
	cfg[4] = 1; cfg[5] = 0;  // version number = 1
	memcpy(cfg+8, &No, 8);   // output length
	htol64((uint64_t *)(cfg+8), 1);
	ubi(g, g, UBI_TYPE_CFG, 32, cfg);

	// ubi MSG
	ubi(g, g, UBI_TYPE_MSG, Nm, m);

	output(g, No, o);
}


// ==- main -==

int main(int argc, char **argv)
{
	int i;
	uint8_t u;
	uint8_t hash[64];

	u = 0xFF;
	skein(1, &u, 512, hash);
	for(i=0; i<64; i++)
		printf("%.2x", hash[i]);
	putchar('\n');

	return 0;
}
