Why?
We want to scramble sensitive data that we send to suppliers for support or analysis. With inspiration from: "Obfuscating your SQL Server Data" by John Magnabosco but tweaked for our purposes. NOTE that the following has only ever been run on development environments and I would not recommend running this on a production system as I have not tested the performance and database load.
Before: StudentID StudentName DateOfBirth ----------- -------------------- ------------- 1 John Smith 1990-03-21 2 Fred Bloggs 1988-11-02 3 Another User 1985-07-11 4 Yet Another User 1977-06-25 After: StudentID StudentName DateOfBirth ----------- -------------------- ------------- 1 mStnoh iJh 1926-01-02 2 lgreg BFdos 1969-03-14 3 onrUest ehAr 1968-05-05 4 otentre AhYe Usr 1964-08-08
- Before:
- StudentID StudentName DateOfBirth
- ----------- -------------------- -------------
- 1 John Smith 1990-03-21
- 2 Fred Bloggs 1988-11-02
- 3 Another User 1985-07-11
- 4 Yet Another User 1977-06-25
- After:
- StudentID StudentName DateOfBirth
- ----------- -------------------- -------------
- 1 mStnoh iJh 1926-01-02
- 2 lgreg BFdos 1969-03-14
- 3 onrUest ehAr 1968-05-05
- 4 otentre AhYe Usr 1964-08-08
Precursor
You will need to be able to create the following view to generate random numbers on SQL Server.
-- Used to reference RAND with in a function CREATE VIEW dbo.vwRandom AS SELECT RAND() as RandomValue; GO
- -- Used to reference RAND with in a function
- CREATE VIEW dbo.vwRandom
- AS
- SELECT RAND() as RandomValue;
- GO
The Function
Again you need permission to create this function. Don't forget to GRANT permission to execute this function for whatever user/system account that will run it.
-- Drop the function if it already exists IF OBJECT_ID ('ufn_DataJumble', 'FN') IS NOT NULL DROP FUNCTION ufn_DataJumble ; GO -- Create user defined function CREATE FUNCTION ufn_DataJumble ( @OrigVal varchar(max) ) RETURNS varchar(max) WITH ENCRYPTION AS BEGIN -- Variables used DECLARE @NewVal varchar(max); DECLARE @OrigLen int; DECLARE @CurrLen int; DECLARE @LoopCt int; DECLARE @Rand int; -- for dates/times DECLARE @MinYear int; DECLARE @MaxYear int; DECLARE @RandYear int; DECLARE @RandMonth int; DECLARE @RandDay int; DECLARE @RandHour int; DECLARE @RandMinute int; DECLARE @RandSeconds int; DECLARE @RandMSeconds int; DECLARE @DateVal date; DECLARE @TimeVal time; DECLARE @DateMatch int; DECLARE @TimeMatch int; DECLARE @ColumnTypeIsDate int; -- Format given value (prevents CHAR datatype issue) SET @OrigVal = LTRIM(RTRIM(@OrigVal)); -- Set variable default values SET @NewVal = ''; SET @OrigLen = DATALENGTH(@OrigVal); SET @CurrLen = @OrigLen; SET @LoopCt = 1; SET @DateVal = ''; -- 1900-01-01 SET @TimeVal = ''; -- 00:00:00.000 SET @DateMatch = 0; SET @TimeMatch = 0; SET @ColumnTypeIsDate = 0; -- Determine if Given Value is date IF @OrigVal IS NULL RETURN NULL; -- Determine if Given Value is date IF ISDATE(@OrigVal)=1 SET @ColumnTypeIsDate = 1; -- Determine if Given Value is date (ISDATE does not recognize CAST AS DATE) IF @OrigLen=10 AND (LEN(@OrigVal) - LEN(REPLACE(@OrigVal, '-', '')))=2 AND ISNUMERIC(SUBSTRING(@OrigVal, 1, 4))=1 AND ISNUMERIC(SUBSTRING(@OrigVal, 6, 2))=1 AND ISNUMERIC(SUBSTRING(@OrigVal, 9, 2))=1 SET @ColumnTypeIsDate = 1; -- Determine if this is a date/time value IF @ColumnTypeIsDate=1 BEGIN -- DETERMINE IF THIS FIELD CONTAINS A DATE VALUE SET @DateMatch = CASE WHEN CAST(CAST(@OrigVal AS date) AS VARCHAR(10))'1900-01-01' THEN 1 ELSE 0 END; -- DETERMINE IF THIS FIELD CONTAINS A TIME VALUE SET @TimeMatch = CASE WHEN CAST(CAST(@OrigVal AS time) AS VARCHAR(10))'00:00:00.000' THEN 1 ELSE 0 END; -- DETERMINE TIME VALUE IF TIME WAS IN GIVEN VALUE IF @DateMatch=1 BEGIN -- SET YEAR RANGE (1912 - 1994: for Birthdate specifically: Older than 18 but younger than 100) SET @MinYear = DATEPART(YEAR,GETDATE())-100; -- 100 years ago from today SET @MaxYear = DATEPART(YEAR,GETDATE())-18; -- 18 years ago from today -- GET RANDOM DATE VALUES SELECT @RandYear = ((@MinYear + 1) - @MaxYear) * RandomValue + @MaxYear FROM dbo.vwRandom; SELECT @RandMonth = (1 - 12) * RandomValue + 12 FROM dbo.vwRandom; SELECT @RandDay = (1 - 28) * RandomValue + 28 FROM dbo.vwRandom; -- prevent invalid dates (specifically 31 days in feb) SET @DateVal = CAST(@RandYear AS CHAR(4)) + '-' + STUFF(@RandMonth, 1, 0, REPLICATE('0', 2 - LEN(@RandMonth))) + '-' + STUFF(@RandDay, 1, 0, REPLICATE('0', 2 - LEN(@RandDay))); END -- DETERMINE TIME VALUE IF TIME WAS IN GIVEN VALUE IF @TimeMatch=1 BEGIN -- SET RANDOM TIME VALUE (include Milliseconds) SELECT @RandHour = (0 - 23) * RandomValue + 23 FROM dbo.vwRandom; SELECT @RandMinute = (0 - 59) * RandomValue + 59 FROM dbo.vwRandom; SELECT @RandSeconds = (0 - 59) * RandomValue + 59 FROM dbo.vwRandom; SELECT @RandMSeconds = (0 - 999) * RandomValue + 999 FROM dbo.vwRandom; SET @TimeVal = STUFF(@RandHour, 1, 0, REPLICATE('0', 2 - LEN(@RandHour))) + ':' + STUFF(@RandMinute, 1, 0, REPLICATE('0', 2 - LEN(@RandMinute))) + ':' + STUFF(@RandSeconds, 1, 0, REPLICATE('0', 2 - LEN(@RandSeconds))) + '.' + STUFF(@RandMSeconds, 1, 0, REPLICATE('0', 3 - LEN(@RandMSeconds))); SET @TimeVal = CONVERT(time, @TimeVal, 14); END -- DETERMINE WHICH DATE PARTS WERE SUBMITTED IF @TimeMatch=1 AND @DateMatch=0 SET @NewVal = @TimeVal; ELSE IF @TimeMatch=0 AND @DateMatch=1 SET @NewVal = @DateVal; ELSE IF @TimeMatch=1 AND @DateMatch=1 SET @NewVal = CAST(@DateVal AS varchar(10)) + ' ' + CAST(@TimeVal AS varchar(16)); ELSE SET @NewVal = ''; -- REDUCE THIS STRING TO THE SAME LENGTH AS SUBMITTED VALUE SET @NewVal = SUBSTRING(LTRIM(RTRIM(CAST(@NewVal AS VARCHAR(30)))), 1, LEN(@OrigVal)); END ELSE BEGIN -- Loop through the characters passed WHILE @LoopCt
- -- Drop the function if it already exists
- IF OBJECT_ID ('ufn_DataJumble', 'FN') IS NOT NULL
- DROP FUNCTION ufn_DataJumble ;
- GO
- -- Create user defined function
- CREATE FUNCTION ufn_DataJumble
- (
- @OrigVal varchar(max)
- )
- RETURNS varchar(max)
- WITH ENCRYPTION
- AS
- BEGIN
- -- Variables used
- DECLARE @NewVal varchar(max);
- DECLARE @OrigLen int;
- DECLARE @CurrLen int;
- DECLARE @LoopCt int;
- DECLARE @Rand int;
- -- for dates/times
- DECLARE @MinYear int;
- DECLARE @MaxYear int;
- DECLARE @RandYear int;
- DECLARE @RandMonth int;
- DECLARE @RandDay int;
- DECLARE @RandHour int;
- DECLARE @RandMinute int;
- DECLARE @RandSeconds int;
- DECLARE @RandMSeconds int;
- DECLARE @DateVal date;
- DECLARE @TimeVal time;
- DECLARE @DateMatch int;
- DECLARE @TimeMatch int;
- DECLARE @ColumnTypeIsDate int;
- -- Format given value (prevents CHAR datatype issue)
- SET @OrigVal = LTRIM(RTRIM(@OrigVal));
- -- Set variable default values
- SET @NewVal = '';
- SET @OrigLen = DATALENGTH(@OrigVal);
- SET @CurrLen = @OrigLen;
- SET @LoopCt = 1;
- SET @DateVal = ''; -- 1900-01-01
- SET @TimeVal = ''; -- 00:00:00.000
- SET @DateMatch = 0;
- SET @TimeMatch = 0;
- SET @ColumnTypeIsDate = 0;
- -- Determine if Given Value is date
- IF @OrigVal IS NULL
- RETURN null;
- -- Determine if Given Value is date
- IF ISDATE(@OrigVal)=1
- SET @ColumnTypeIsDate = 1;
- -- Determine if Given Value is date (ISDATE does not recognize CAST AS DATE)
- IF @OrigLen=10
- AND (LEN(@OrigVal) - LEN(REPLACE(@OrigVal, '-', '')))=2
- AND ISNUMERIC(SUBSTRING(@OrigVal, 1, 4))=1
- AND ISNUMERIC(SUBSTRING(@OrigVal, 6, 2))=1
- AND ISNUMERIC(SUBSTRING(@OrigVal, 9, 2))=1
- SET @ColumnTypeIsDate = 1;
- -- Determine if this is a date/time value
- IF @ColumnTypeIsDate=1
- BEGIN
- -- DETERMINE IF THIS FIELD CONTAINS A DATE VALUE
- SET @DateMatch =
- CASE
- WHEN CAST(CAST(@OrigVal AS date) AS VARCHAR(10))'1900-01-01' THEN 1
- ELSE 0
- END;
- -- DETERMINE IF THIS FIELD CONTAINS A TIME VALUE
- SET @TimeMatch =
- CASE
- WHEN CAST(CAST(@OrigVal AS time) AS VARCHAR(10))'00:00:00.000' THEN 1
- ELSE 0
- END;
- -- DETERMINE TIME VALUE IF TIME WAS IN GIVEN VALUE
- IF @DateMatch=1
- BEGIN
- -- SET YEAR RANGE (1912 - 1994: for Birthdate specifically: Older than 18 but younger than 100)
- SET @MinYear = DATEPART(YEAR,GETDATE())-100; -- 100 years ago from today
- SET @MaxYear = DATEPART(YEAR,GETDATE())-18; -- 18 years ago from today
- -- GET RANDOM DATE VALUES
- SELECT @RandYear = ((@MinYear + 1) - @MaxYear) * RandomValue + @MaxYear FROM dbo.vwRandom;
- SELECT @RandMonth = (1 - 12) * RandomValue + 12 FROM dbo.vwRandom;
- SELECT @RandDay = (1 - 28) * RandomValue + 28 FROM dbo.vwRandom; -- prevent invalid dates (specifically 31 days in feb)
- SET @DateVal =
- CAST(@RandYear AS CHAR(4)) + '-'
- + STUFF(@RandMonth, 1, 0, REPLICATE('0', 2 - LEN(@RandMonth))) + '-'
- + STUFF(@RandDay, 1, 0, REPLICATE('0', 2 - LEN(@RandDay)));
- END
- -- DETERMINE TIME VALUE IF TIME WAS IN GIVEN VALUE
- IF @TimeMatch=1
- BEGIN
- -- SET RANDOM TIME VALUE (include Milliseconds)
- SELECT @RandHour = (0 - 23) * RandomValue + 23 FROM dbo.vwRandom;
- SELECT @RandMinute = (0 - 59) * RandomValue + 59 FROM dbo.vwRandom;
- SELECT @RandSeconds = (0 - 59) * RandomValue + 59 FROM dbo.vwRandom;
- SELECT @RandMSeconds = (0 - 999) * RandomValue + 999 FROM dbo.vwRandom;
- SET @TimeVal =
- STUFF(@RandHour, 1, 0, REPLICATE('0', 2 - LEN(@RandHour)))
- + ':' + STUFF(@RandMinute, 1, 0, REPLICATE('0', 2 - LEN(@RandMinute)))
- + ':' + STUFF(@RandSeconds, 1, 0, REPLICATE('0', 2 - LEN(@RandSeconds)))
- + '.' + STUFF(@RandMSeconds, 1, 0, REPLICATE('0', 3 - LEN(@RandMSeconds)));
- SET @TimeVal = CONVERT(time, @TimeVal, 14);
- END
- -- DETERMINE WHICH DATE PARTS WERE SUBMITTED
- IF @TimeMatch=1 AND @DateMatch=0
- SET @NewVal = @TimeVal;
- ELSE IF @TimeMatch=0 AND @DateMatch=1
- SET @NewVal = @DateVal;
- ELSE IF @TimeMatch=1 AND @DateMatch=1
- SET @NewVal = CAST(@DateVal AS varchar(10)) + ' ' + CAST(@TimeVal AS varchar(16));
- ELSE
- SET @NewVal = '';
- -- REDUCE THIS STRING TO THE SAME LENGTH AS SUBMITTED VALUE
- SET @NewVal = SUBSTRING(LTRIM(RTRIM(CAST(@NewVal AS VARCHAR(30)))), 1, LEN(@OrigVal));
- END
- ELSE
- BEGIN
- -- Loop through the characters passed
- WHILE @LoopCt
Usage
SELECT Common.ufn_DataJumble(Forenames) AS Forenames , Common.ufn_DataJumble(Surname) AS Surnames , Common.ufn_DataJumble(AccountNumber) AS AccountNumber , Common.ufn_DataJumble(DateOfBirth) AS DateOfBirth FROM mySchema.mySensitiveTable;
- SELECT
- Common.ufn_DataJumble(Forenames) AS Forenames
- , Common.ufn_DataJumble(Surname) AS Surnames
- , Common.ufn_DataJumble(AccountNumber) AS AccountNumber
- , Common.ufn_DataJumble(DateOfBirth) AS DateOfBirth
- FROM
- mySchema.mySensitiveTable;
UPDATE Student.PersonExtract SET Surname=Common.ufn_DataJumble(Surname);
- UPDATE Student.PersonExtract
- SET Surname=Common.ufn_DataJumble(Surname);
Known Issues (fixed in above function)
- "Invalid length parameter passed to the LEFT or SUBSTRING function"
- Add LTRIM(RTRIM(Forenames)) if Forenames is a char datatype.
- DateOfBirth is interpreted as string (isdate does not recognize datatype date)
- CAST(Common.ufn_DataJumble(CAST(DateOfBirth AS DATETIME)) AS DATE) AS DateOfBirth
Still To Do
Create stored procedure that will take any table and return the table with scrambled data:
- Needs to determine datatype
- Cannot exceed allowed length (shouldn't happen if simply scrambling data)
- Function needs to be reduced in size
Stored Procedure (using this function)
So that I can give it any table name and it will maintain structure and data types:
USE UnitTestDataStaging; GO -- Drop Stored Procedure if already exists IF OBJECT_ID ( 'Common.ufn_DataJumble', 'P' ) IS NOT NULL DROP PROCEDURE Common.ufn_DataJumble; GO -- Create Stored Procedure CREATE PROCEDURE Common.ufn_DataJumble @SchemaTableName nvarchar(100) AS BEGIN -- Variables used DECLARE @TableName varchar(50); DECLARE @SqlToExecute nvarchar(4000); -- Set variable default values SET @TableName = SUBSTRING(@SchemaTableName, CHARINDEX('.', @SchemaTableName)+1, LEN(@SchemaTableName)); SET @SqlToExecute = ''; -- Generate Select Query to extract data from given table SET @SqlToExecute='SELECT ' + CHAR(10) + STUFF( ( -- Code to concatenate column names and data types into one string SELECT ',Common.ufn_DataJumble(ltrim(rtrim(' + COLUMN_NAME+ '))) AS ' + COLUMN_NAME + CHAR(10) FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = @TableName ORDER BY ORDINAL_POSITION FOR XML PATH ('') ),1,1,'' ) + ' FROM ' + @SchemaTableName; EXEC(@SqlToExecute); END GO -- execute stored procedure on a table EXECUTE Common.ufn_DataJumble 'dbo.myTable';
- USE UnitTestDataStaging;
- GO
- -- Drop Stored Procedure if already exists
- IF OBJECT_ID ( 'Common.ufn_DataJumble', 'P' ) IS NOT NULL
- DROP PROCEDURE Common.ufn_DataJumble;
- GO
- -- Create Stored Procedure
- CREATE PROCEDURE Common.ufn_DataJumble
- @SchemaTableName nvarchar(100)
- AS
- BEGIN
- -- Variables used
- DECLARE @TableName varchar(50);
- DECLARE @SqlToExecute nvarchar(4000);
- -- Set variable default values
- SET @TableName = SUBSTRING(@SchemaTableName, CHARINDEX('.', @SchemaTableName)+1, LEN(@SchemaTableName));
- SET @SqlToExecute = '';
- -- Generate Select Query to extract data from given table
- SET @SqlToExecute='SELECT ' + CHAR(10) + STUFF(
- (
- -- Code to concatenate column names and data types into one string
- SELECT
- ',Common.ufn_DataJumble(ltrim(rtrim(' + COLUMN_NAME+ '))) AS ' + COLUMN_NAME + CHAR(10)
- FROM
- INFORMATION_SCHEMA.COLUMNS
- WHERE
- TABLE_NAME = @TableName
- ORDER BY ORDINAL_POSITION
- FOR XML PATH ('')
- ),1,1,''
- ) + ' FROM ' + @SchemaTableName;
- EXEC(@SqlToExecute);
- END
- GO
- -- execute stored procedure on a table
- EXECUTE Common.ufn_DataJumble 'dbo.myTable';